/* $Id: IEMAllAImplC.cpp 95460 2022-06-30 12:52:13Z vboxsync $ */
/** @file
 * IEM - Instruction Implementation in Assembly, portable C variant.
 */

/*
 * Copyright (C) 2011-2022 Oracle Corporation
 *
 * This file is part of VirtualBox Open Source Edition (OSE), as
 * available from http://www.virtualbox.org. This file is free software;
 * you can redistribute it and/or modify it under the terms of the GNU
 * General Public License (GPL) as published by the Free Software
 * Foundation, in version 2 as it comes in the "COPYING" file of the
 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
 */


/*********************************************************************************************************************************
*   Header Files                                                                                                                 *
*********************************************************************************************************************************/
#include "IEMInternal.h"
#include <VBox/vmm/vmcc.h>
#include <iprt/errcore.h>
#include <iprt/x86.h>
#include <iprt/uint128.h>
#include <iprt/uint256.h>
#include <iprt/crc.h>

RT_C_DECLS_BEGIN
#include <softfloat.h>
RT_C_DECLS_END


/*********************************************************************************************************************************
*   Defined Constants And Macros                                                                                                 *
*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 */
#if !defined(IEM_WITHOUT_ASSEMBLY)
# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
#  define IEM_WITHOUT_ASSEMBLY
# endif
#endif
/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
#ifdef IEM_WITH_ASSEMBLY
# undef IEM_WITHOUT_ASSEMBLY
#endif

/**
 * Calculates the signed flag value given a result and its bit width.
 *
 * The signed flag (SF) is a duplication of the most significant bit in the
 * result.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) & X86_EFL_SF )

/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) indicates whether the result is zero or not.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)((a_uResult) == 0) << X86_EFL_ZF_BIT )

/**
 * Extracts the OF flag from an OF calculation result.
 *
 * These are typically used by concatenating with a bit count.  The problem is
 * that 8-bit values need shifting in the other direction than the others.
 */
#define X86_EFL_GET_OF_8(a_uValue)  (((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_16(a_uValue) ((uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_32(a_uValue) ((uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_64(a_uValue) ((uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
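
/* A disabled compile-time sketch (not part of the original file) illustrating
   the helpers above, assuming the architectural EFLAGS bit positions from
   iprt/x86.h (SF = bit 7, ZF = bit 6, OF = bit 11): */
#if 0
AssertCompile(X86_EFL_CALC_SF(UINT32_C(0x80), 8)    == X86_EFL_SF); /* MSB of an 8-bit result -> SF. */
AssertCompile(X86_EFL_CALC_SF(UINT32_C(0x7fff), 16) == 0);          /* positive 16-bit result -> no SF. */
AssertCompile(X86_EFL_CALC_ZF(0)                    == X86_EFL_ZF);
AssertCompile(X86_EFL_GET_OF_8(UINT32_C(0x80))      == X86_EFL_OF); /* sign bit 7 shifted up to OF bit 11. */
AssertCompile(X86_EFL_GET_OF_32(RT_BIT_32(31))      == X86_EFL_OF); /* sign bit 31 shifted down to OF bit 11. */
#endif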

/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after an arithmetic op.
 *
 * @returns Status bits.
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* Overflow during ADDition happens when both inputs have the same sign \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
           follows that for SUBtraction the sign bit value must differ between \
           the two inputs and the result's sign bit must differ from the first \
           input's.  Note! Must xor with the sign bit to convert, not do \
           (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                      & RT_BIT_64(a_cBitsWidth - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
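
/* A disabled worked example (not from the original file) of the overflow rule
   above, for 8-bit ADD 0x7f + 0x01: both inputs have a clear sign bit, the
   result 0x80 has it set, so ~(uDst ^ uSrc) & 0x80 & (uResult ^ uDst) == 0x80
   and OF is raised.  For SUB the macro is invoked with a_uSrcOf = uSrc ^ 0x80,
   which makes the same expression cover 0x80 - 0x01 = 0x7f as well. */
#if 0
AssertCompile(X86_EFL_GET_OF_8((uint8_t)~(0x7f ^ 0x01)          & RT_BIT_64(7) & (0x80 ^ 0x7f)) == X86_EFL_OF); /* ADD */
AssertCompile(X86_EFL_GET_OF_8((uint8_t)~(0x80 ^ (0x01 ^ 0x80)) & RT_BIT_64(7) & (0x7f ^ 0x80)) == X86_EFL_OF); /* SUB */
#endif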

/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand
 * is undefined.  We do not set AF, as that seems to make the most sense (which
 * probably makes it the most wrong in real life).
 *
 * @returns Status bits.
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)


/*********************************************************************************************************************************
*   Global Variables                                                                                                             *
*********************************************************************************************************************************/
/**
 * Parity calculation table.
 *
 * This is also used by iemAllAImpl.asm.
 *
 * The generator code:
 * @code
 * #include <stdio.h>
 *
 * int main()
 * {
 *     unsigned b;
 *     for (b = 0; b < 256; b++)
 *     {
 *         int cOnes = ( b       & 1)
 *                   + ((b >> 1) & 1)
 *                   + ((b >> 2) & 1)
 *                   + ((b >> 3) & 1)
 *                   + ((b >> 4) & 1)
 *                   + ((b >> 5) & 1)
 *                   + ((b >> 6) & 1)
 *                   + ((b >> 7) & 1);
 *         printf("    /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
 *                b,
 *                (b >> 7) & 1,
 *                (b >> 6) & 1,
 *                (b >> 5) & 1,
 *                (b >> 4) & 1,
 *                (b >> 3) & 1,
 *                (b >> 2) & 1,
 *                (b >> 1) & 1,
 *                b & 1,
 *                cOnes & 1 ? "0" : "X86_EFL_PF");
 *     }
 *     return 0;
 * }
 * @endcode
 */
uint8_t const g_afParity[256] =
{
    /* 0x00 = 00000000b */ X86_EFL_PF,
    /* 0x01 = 00000001b */ 0,
    /* 0x02 = 00000010b */ 0,
    /* 0x03 = 00000011b */ X86_EFL_PF,
    /* 0x04 = 00000100b */ 0,
    /* 0x05 = 00000101b */ X86_EFL_PF,
    /* 0x06 = 00000110b */ X86_EFL_PF,
    /* 0x07 = 00000111b */ 0,
    /* 0x08 = 00001000b */ 0,
    /* 0x09 = 00001001b */ X86_EFL_PF,
    /* 0x0a = 00001010b */ X86_EFL_PF,
    /* 0x0b = 00001011b */ 0,
    /* 0x0c = 00001100b */ X86_EFL_PF,
    /* 0x0d = 00001101b */ 0,
    /* 0x0e = 00001110b */ 0,
    /* 0x0f = 00001111b */ X86_EFL_PF,
    /* 0x10 = 00010000b */ 0,
    /* 0x11 = 00010001b */ X86_EFL_PF,
    /* 0x12 = 00010010b */ X86_EFL_PF,
    /* 0x13 = 00010011b */ 0,
    /* 0x14 = 00010100b */ X86_EFL_PF,
    /* 0x15 = 00010101b */ 0,
    /* 0x16 = 00010110b */ 0,
    /* 0x17 = 00010111b */ X86_EFL_PF,
    /* 0x18 = 00011000b */ X86_EFL_PF,
    /* 0x19 = 00011001b */ 0,
    /* 0x1a = 00011010b */ 0,
    /* 0x1b = 00011011b */ X86_EFL_PF,
    /* 0x1c = 00011100b */ 0,
    /* 0x1d = 00011101b */ X86_EFL_PF,
    /* 0x1e = 00011110b */ X86_EFL_PF,
    /* 0x1f = 00011111b */ 0,
    /* 0x20 = 00100000b */ 0,
    /* 0x21 = 00100001b */ X86_EFL_PF,
    /* 0x22 = 00100010b */ X86_EFL_PF,
    /* 0x23 = 00100011b */ 0,
    /* 0x24 = 00100100b */ X86_EFL_PF,
    /* 0x25 = 00100101b */ 0,
    /* 0x26 = 00100110b */ 0,
    /* 0x27 = 00100111b */ X86_EFL_PF,
    /* 0x28 = 00101000b */ X86_EFL_PF,
    /* 0x29 = 00101001b */ 0,
    /* 0x2a = 00101010b */ 0,
    /* 0x2b = 00101011b */ X86_EFL_PF,
    /* 0x2c = 00101100b */ 0,
    /* 0x2d = 00101101b */ X86_EFL_PF,
    /* 0x2e = 00101110b */ X86_EFL_PF,
    /* 0x2f = 00101111b */ 0,
    /* 0x30 = 00110000b */ X86_EFL_PF,
    /* 0x31 = 00110001b */ 0,
    /* 0x32 = 00110010b */ 0,
    /* 0x33 = 00110011b */ X86_EFL_PF,
    /* 0x34 = 00110100b */ 0,
    /* 0x35 = 00110101b */ X86_EFL_PF,
    /* 0x36 = 00110110b */ X86_EFL_PF,
    /* 0x37 = 00110111b */ 0,
    /* 0x38 = 00111000b */ 0,
    /* 0x39 = 00111001b */ X86_EFL_PF,
    /* 0x3a = 00111010b */ X86_EFL_PF,
    /* 0x3b = 00111011b */ 0,
    /* 0x3c = 00111100b */ X86_EFL_PF,
    /* 0x3d = 00111101b */ 0,
    /* 0x3e = 00111110b */ 0,
    /* 0x3f = 00111111b */ X86_EFL_PF,
    /* 0x40 = 01000000b */ 0,
    /* 0x41 = 01000001b */ X86_EFL_PF,
    /* 0x42 = 01000010b */ X86_EFL_PF,
    /* 0x43 = 01000011b */ 0,
    /* 0x44 = 01000100b */ X86_EFL_PF,
    /* 0x45 = 01000101b */ 0,
    /* 0x46 = 01000110b */ 0,
    /* 0x47 = 01000111b */ X86_EFL_PF,
    /* 0x48 = 01001000b */ X86_EFL_PF,
    /* 0x49 = 01001001b */ 0,
    /* 0x4a = 01001010b */ 0,
    /* 0x4b = 01001011b */ X86_EFL_PF,
    /* 0x4c = 01001100b */ 0,
    /* 0x4d = 01001101b */ X86_EFL_PF,
    /* 0x4e = 01001110b */ X86_EFL_PF,
    /* 0x4f = 01001111b */ 0,
    /* 0x50 = 01010000b */ X86_EFL_PF,
    /* 0x51 = 01010001b */ 0,
    /* 0x52 = 01010010b */ 0,
    /* 0x53 = 01010011b */ X86_EFL_PF,
    /* 0x54 = 01010100b */ 0,
    /* 0x55 = 01010101b */ X86_EFL_PF,
    /* 0x56 = 01010110b */ X86_EFL_PF,
    /* 0x57 = 01010111b */ 0,
    /* 0x58 = 01011000b */ 0,
    /* 0x59 = 01011001b */ X86_EFL_PF,
    /* 0x5a = 01011010b */ X86_EFL_PF,
    /* 0x5b = 01011011b */ 0,
    /* 0x5c = 01011100b */ X86_EFL_PF,
    /* 0x5d = 01011101b */ 0,
    /* 0x5e = 01011110b */ 0,
    /* 0x5f = 01011111b */ X86_EFL_PF,
    /* 0x60 = 01100000b */ X86_EFL_PF,
    /* 0x61 = 01100001b */ 0,
    /* 0x62 = 01100010b */ 0,
    /* 0x63 = 01100011b */ X86_EFL_PF,
    /* 0x64 = 01100100b */ 0,
    /* 0x65 = 01100101b */ X86_EFL_PF,
    /* 0x66 = 01100110b */ X86_EFL_PF,
    /* 0x67 = 01100111b */ 0,
    /* 0x68 = 01101000b */ 0,
    /* 0x69 = 01101001b */ X86_EFL_PF,
    /* 0x6a = 01101010b */ X86_EFL_PF,
    /* 0x6b = 01101011b */ 0,
    /* 0x6c = 01101100b */ X86_EFL_PF,
    /* 0x6d = 01101101b */ 0,
    /* 0x6e = 01101110b */ 0,
    /* 0x6f = 01101111b */ X86_EFL_PF,
    /* 0x70 = 01110000b */ 0,
    /* 0x71 = 01110001b */ X86_EFL_PF,
    /* 0x72 = 01110010b */ X86_EFL_PF,
    /* 0x73 = 01110011b */ 0,
    /* 0x74 = 01110100b */ X86_EFL_PF,
    /* 0x75 = 01110101b */ 0,
    /* 0x76 = 01110110b */ 0,
    /* 0x77 = 01110111b */ X86_EFL_PF,
    /* 0x78 = 01111000b */ X86_EFL_PF,
    /* 0x79 = 01111001b */ 0,
    /* 0x7a = 01111010b */ 0,
    /* 0x7b = 01111011b */ X86_EFL_PF,
    /* 0x7c = 01111100b */ 0,
    /* 0x7d = 01111101b */ X86_EFL_PF,
    /* 0x7e = 01111110b */ X86_EFL_PF,
    /* 0x7f = 01111111b */ 0,
    /* 0x80 = 10000000b */ 0,
    /* 0x81 = 10000001b */ X86_EFL_PF,
    /* 0x82 = 10000010b */ X86_EFL_PF,
    /* 0x83 = 10000011b */ 0,
    /* 0x84 = 10000100b */ X86_EFL_PF,
    /* 0x85 = 10000101b */ 0,
    /* 0x86 = 10000110b */ 0,
    /* 0x87 = 10000111b */ X86_EFL_PF,
    /* 0x88 = 10001000b */ X86_EFL_PF,
    /* 0x89 = 10001001b */ 0,
    /* 0x8a = 10001010b */ 0,
    /* 0x8b = 10001011b */ X86_EFL_PF,
    /* 0x8c = 10001100b */ 0,
    /* 0x8d = 10001101b */ X86_EFL_PF,
    /* 0x8e = 10001110b */ X86_EFL_PF,
    /* 0x8f = 10001111b */ 0,
    /* 0x90 = 10010000b */ X86_EFL_PF,
    /* 0x91 = 10010001b */ 0,
    /* 0x92 = 10010010b */ 0,
    /* 0x93 = 10010011b */ X86_EFL_PF,
    /* 0x94 = 10010100b */ 0,
    /* 0x95 = 10010101b */ X86_EFL_PF,
    /* 0x96 = 10010110b */ X86_EFL_PF,
    /* 0x97 = 10010111b */ 0,
    /* 0x98 = 10011000b */ 0,
    /* 0x99 = 10011001b */ X86_EFL_PF,
    /* 0x9a = 10011010b */ X86_EFL_PF,
    /* 0x9b = 10011011b */ 0,
    /* 0x9c = 10011100b */ X86_EFL_PF,
    /* 0x9d = 10011101b */ 0,
    /* 0x9e = 10011110b */ 0,
    /* 0x9f = 10011111b */ X86_EFL_PF,
    /* 0xa0 = 10100000b */ X86_EFL_PF,
    /* 0xa1 = 10100001b */ 0,
    /* 0xa2 = 10100010b */ 0,
    /* 0xa3 = 10100011b */ X86_EFL_PF,
    /* 0xa4 = 10100100b */ 0,
    /* 0xa5 = 10100101b */ X86_EFL_PF,
    /* 0xa6 = 10100110b */ X86_EFL_PF,
    /* 0xa7 = 10100111b */ 0,
    /* 0xa8 = 10101000b */ 0,
    /* 0xa9 = 10101001b */ X86_EFL_PF,
    /* 0xaa = 10101010b */ X86_EFL_PF,
    /* 0xab = 10101011b */ 0,
    /* 0xac = 10101100b */ X86_EFL_PF,
    /* 0xad = 10101101b */ 0,
    /* 0xae = 10101110b */ 0,
    /* 0xaf = 10101111b */ X86_EFL_PF,
    /* 0xb0 = 10110000b */ 0,
    /* 0xb1 = 10110001b */ X86_EFL_PF,
    /* 0xb2 = 10110010b */ X86_EFL_PF,
    /* 0xb3 = 10110011b */ 0,
    /* 0xb4 = 10110100b */ X86_EFL_PF,
    /* 0xb5 = 10110101b */ 0,
    /* 0xb6 = 10110110b */ 0,
    /* 0xb7 = 10110111b */ X86_EFL_PF,
    /* 0xb8 = 10111000b */ X86_EFL_PF,
    /* 0xb9 = 10111001b */ 0,
    /* 0xba = 10111010b */ 0,
    /* 0xbb = 10111011b */ X86_EFL_PF,
    /* 0xbc = 10111100b */ 0,
    /* 0xbd = 10111101b */ X86_EFL_PF,
    /* 0xbe = 10111110b */ X86_EFL_PF,
    /* 0xbf = 10111111b */ 0,
    /* 0xc0 = 11000000b */ X86_EFL_PF,
    /* 0xc1 = 11000001b */ 0,
    /* 0xc2 = 11000010b */ 0,
    /* 0xc3 = 11000011b */ X86_EFL_PF,
    /* 0xc4 = 11000100b */ 0,
    /* 0xc5 = 11000101b */ X86_EFL_PF,
    /* 0xc6 = 11000110b */ X86_EFL_PF,
    /* 0xc7 = 11000111b */ 0,
    /* 0xc8 = 11001000b */ 0,
    /* 0xc9 = 11001001b */ X86_EFL_PF,
    /* 0xca = 11001010b */ X86_EFL_PF,
    /* 0xcb = 11001011b */ 0,
    /* 0xcc = 11001100b */ X86_EFL_PF,
    /* 0xcd = 11001101b */ 0,
    /* 0xce = 11001110b */ 0,
    /* 0xcf = 11001111b */ X86_EFL_PF,
    /* 0xd0 = 11010000b */ 0,
    /* 0xd1 = 11010001b */ X86_EFL_PF,
    /* 0xd2 = 11010010b */ X86_EFL_PF,
    /* 0xd3 = 11010011b */ 0,
    /* 0xd4 = 11010100b */ X86_EFL_PF,
    /* 0xd5 = 11010101b */ 0,
    /* 0xd6 = 11010110b */ 0,
    /* 0xd7 = 11010111b */ X86_EFL_PF,
    /* 0xd8 = 11011000b */ X86_EFL_PF,
    /* 0xd9 = 11011001b */ 0,
    /* 0xda = 11011010b */ 0,
    /* 0xdb = 11011011b */ X86_EFL_PF,
    /* 0xdc = 11011100b */ 0,
    /* 0xdd = 11011101b */ X86_EFL_PF,
    /* 0xde = 11011110b */ X86_EFL_PF,
    /* 0xdf = 11011111b */ 0,
    /* 0xe0 = 11100000b */ 0,
    /* 0xe1 = 11100001b */ X86_EFL_PF,
    /* 0xe2 = 11100010b */ X86_EFL_PF,
    /* 0xe3 = 11100011b */ 0,
    /* 0xe4 = 11100100b */ X86_EFL_PF,
    /* 0xe5 = 11100101b */ 0,
    /* 0xe6 = 11100110b */ 0,
    /* 0xe7 = 11100111b */ X86_EFL_PF,
    /* 0xe8 = 11101000b */ X86_EFL_PF,
    /* 0xe9 = 11101001b */ 0,
    /* 0xea = 11101010b */ 0,
    /* 0xeb = 11101011b */ X86_EFL_PF,
    /* 0xec = 11101100b */ 0,
    /* 0xed = 11101101b */ X86_EFL_PF,
    /* 0xee = 11101110b */ X86_EFL_PF,
    /* 0xef = 11101111b */ 0,
    /* 0xf0 = 11110000b */ X86_EFL_PF,
    /* 0xf1 = 11110001b */ 0,
    /* 0xf2 = 11110010b */ 0,
    /* 0xf3 = 11110011b */ X86_EFL_PF,
    /* 0xf4 = 11110100b */ 0,
    /* 0xf5 = 11110101b */ X86_EFL_PF,
    /* 0xf6 = 11110110b */ X86_EFL_PF,
    /* 0xf7 = 11110111b */ 0,
    /* 0xf8 = 11111000b */ 0,
    /* 0xf9 = 11111001b */ X86_EFL_PF,
    /* 0xfa = 11111010b */ X86_EFL_PF,
    /* 0xfb = 11111011b */ 0,
    /* 0xfc = 11111100b */ X86_EFL_PF,
    /* 0xfd = 11111101b */ 0,
    /* 0xfe = 11111110b */ 0,
    /* 0xff = 11111111b */ X86_EFL_PF,
};

/* for clang: */
extern const RTFLOAT80U  g_ar80Zero[];
extern const RTFLOAT80U  g_ar80One[];
extern const RTFLOAT80U  g_r80Indefinite;
extern const RTFLOAT80U  g_ar80Infinity[];
extern const RTFLOAT128U g_r128Ln2;
extern const RTUINT128U  g_u128Ln2Mantissa;
extern const RTUINT128U  g_u128Ln2MantissaIntel;
extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];

/** Zero values (indexed by fSign). */
RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };

/** One values (indexed by fSign). */
RTFLOAT80U const g_ar80One[] =
{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };

/** Indefinite (negative). */
RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);

/** Infinities (indexed by fSign). */
RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };

#if 0
/** 128-bit floating point constant: 2.0 */
const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
#endif


/* The next section is generated by tools/IEMGenFpuConstants: */

/** The ln2 constant as 128-bit floating point value.
 * base-10: 6.93147180559945309417232121458176575e-1
 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
/** High precision ln2 value.
 * base-10: 6.931471805599453094172321214581765680747e-1
 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
 * base-10: 6.931471805599453094151379470289064954613e-1
 * base-16: b.17217f7d1cf79abc0000000000000000@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);

/** Horner constants for f2xm1 */
const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
{
    /* a0
     * base-10: 1.00000000000000000000000000000000000e0
     * base-16: 1.0000000000000000000000000000@0
     * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
    RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
    /* a1
     * base-10: 5.00000000000000000000000000000000000e-1
     * base-16: 8.0000000000000000000000000000@-1
     * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
    RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
    /* a2
     * base-10: 1.66666666666666666666666666666666658e-1
     * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
     * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
    RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
    /* a3
     * base-10: 4.16666666666666666666666666666666646e-2
     * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
     * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
    RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
    /* a4
     * base-10: 8.33333333333333333333333333333333323e-3
     * base-16: 2.2222222222222222222222222222@-2
     * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
    RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
    /* a5
     * base-10: 1.38888888888888888888888888888888874e-3
     * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
     * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
    RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
    /* a6
     * base-10: 1.98412698412698412698412698412698412e-4
     * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
     * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
    RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
    /* a7
     * base-10: 2.48015873015873015873015873015873015e-5
     * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
     * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
    RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
    /* a8
     * base-10: 2.75573192239858906525573192239858902e-6
     * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
     * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
    RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
    /* a9
     * base-10: 2.75573192239858906525573192239858865e-7
     * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
     * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
    RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
    /* a10
     * base-10: 2.50521083854417187750521083854417184e-8
     * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
     * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
    RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
    /* a11
     * base-10: 2.08767569878680989792100903212014296e-9
     * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
     * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
    RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
    /* a12
     * base-10: 1.60590438368216145993923771701549472e-10
     * base-16: b.092309d43684be51c198e91d7b40@-9
     * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
    RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
    /* a13
     * base-10: 1.14707455977297247138516979786821043e-11
     * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
     * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
    RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
    /* a14
     * base-10: 7.64716373181981647590113198578806964e-13
     * base-16: d.73f9f399dc0f88ec32b587746578@-11
     * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
    RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
    /* a15
     * base-10: 4.77947733238738529743820749111754352e-14
     * base-16: d.73f9f399dc0f88ec32b587746578@-12
     * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
    RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
    /* a16
     * base-10: 2.81145725434552076319894558301031970e-15
     * base-16: c.a963b81856a53593028cbbb8d7f8@-13
     * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
    RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
    /* a17
     * base-10: 1.56192069685862264622163643500573321e-16
     * base-16: b.413c31dcbecbbdd8024435161550@-14
     * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
    RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
    /* a18
     * base-10: 8.22063524662432971695598123687227980e-18
     * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
     * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
    RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
    /* a19
     * base-10: 4.11031762331216485847799061843614006e-19
     * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
     * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
    RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
    /* a20
     * base-10: 7.04351638180413298434020229233492164e-20
     * base-16: 1.4c9ee35db1d1f3c946fdcd48fd88@-16
     * base-2 : 1.0100110010011110111000110101110110110001110100011111001111001001010001101111110111001101010010001111110110001000e-64 */
    RTFLOAT128U_INIT_C(0, 0x4c9ee35db1d1, 0xf3c946fdcd48fd88, 0x3fbf),
    /* a21
     * base-10: 5.81527769640186708776361513365257702e-20
     * base-16: 1.129e64bff606a2b9c9fc624481cd@-16
     * base-2 : 1.0001001010011110011001001011111111110110000001101010001010111001110010011111110001100010010001001000000111001101e-64 */
    RTFLOAT128U_INIT_C(0, 0x129e64bff606, 0xa2b9c9fc624481cd, 0x3fbf),
};
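
/* A disabled double-precision sketch (not from the original file) of the
   Horner scheme the table above feeds: f2xm1(x) = y * P(y) with y = x*ln2,
   where the leading coefficients are the Taylor terms 1/(k+1)! of
   (e^y - 1)/y.  The real code evaluates this on RTFLOAT128U via SoftFloat,
   and the last table entries look tuned rather than being pure Taylor terms,
   so this only illustrates the shape of the algorithm. */
#if 0
static double iemF2xm1HornerSketch(double x)
{
    static double const s_aCoeffs[] = /* a_k = 1/(k+1)! */
    { 1.0, 1.0 / 2, 1.0 / 6, 1.0 / 24, 1.0 / 120, 1.0 / 720, 1.0 / 5040, 1.0 / 40320 };
    double const y = x * 0.69314718055994530942; /* x * ln(2) */
    double       r = s_aCoeffs[RT_ELEMENTS(s_aCoeffs) - 1];
    for (unsigned i = RT_ELEMENTS(s_aCoeffs) - 1; i-- > 0;)
        r = r * y + s_aCoeffs[i];   /* Horner step: r = a_i + y*r */
    return y * r;                   /* e^(x*ln2) - 1 == 2^x - 1 */
}
#endif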


/*
 * There are a few 64-bit on 32-bit things we'd rather do in C.  Actually,
 * doing it all in C is probably safer for now; optimize what's necessary
 * later, maybe.
 */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)


/*********************************************************************************************************************************
*   Binary Operations                                                                                                            *
*********************************************************************************************************************************/

/*
 * ADD
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    uint64_t uDst    = *puDst;
    uint64_t uResult = uDst + uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
}
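
/* A disabled sketch (not from the original file) of why "uResult < uDst" is
   the carry expression: unsigned addition wraps modulo 2^64, so the sum is
   smaller than an input exactly when there was a carry out of the top bit. */
#if 0
uint64_t const uDst    = UINT64_MAX;
uint64_t const uResult = uDst + 1;  /* wraps to 0; 0 < UINT64_MAX, so CF = 1 */
#endif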

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    uint32_t uDst    = *puDst;
    uint32_t uResult = uDst + uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    uint16_t uDst    = *puDst;
    uint16_t uResult = uDst + uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
{
    uint8_t uDst    = *puDst;
    uint8_t uResult = uDst + uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */

/*
 * ADC
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    if (!(*pfEFlags & X86_EFL_CF))
        iemAImpl_add_u64(puDst, uSrc, pfEFlags);
    else
    {
        uint64_t uDst    = *puDst;
        uint64_t uResult = uDst + uSrc + 1;
        *puDst = uResult;
        IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
    }
}
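
/* A disabled sketch (not from the original file) of the carry-in variant
   above: with an incoming CF the sum is uDst + uSrc + 1, and "uResult <= uDst"
   is the carry-out test because equality can only happen when uSrc wrapped
   all the way around (uSrc == UINT64_MAX). */
#if 0
uint64_t const uDst    = 7;
uint64_t const uResult = uDst + UINT64_MAX + 1; /* == 7, and 7 <= 7, so CF = 1 */
#endif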

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    if (!(*pfEFlags & X86_EFL_CF))
        iemAImpl_add_u32(puDst, uSrc, pfEFlags);
    else
    {
        uint32_t uDst    = *puDst;
        uint32_t uResult = uDst + uSrc + 1;
        *puDst = uResult;
        IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
    }
}


IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    if (!(*pfEFlags & X86_EFL_CF))
        iemAImpl_add_u16(puDst, uSrc, pfEFlags);
    else
    {
        uint16_t uDst    = *puDst;
        uint16_t uResult = uDst + uSrc + 1;
        *puDst = uResult;
        IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
    }
}


IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
{
    if (!(*pfEFlags & X86_EFL_CF))
        iemAImpl_add_u8(puDst, uSrc, pfEFlags);
    else
    {
        uint8_t uDst    = *puDst;
        uint8_t uResult = uDst + uSrc + 1;
        *puDst = uResult;
        IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
    }
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */

/*
 * SUB
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    uint64_t uDst    = *puDst;
    uint64_t uResult = uDst - uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    uint32_t uDst    = *puDst;
    uint32_t uResult = uDst - uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
}


IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    uint16_t uDst    = *puDst;
    uint16_t uResult = uDst - uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
{
    uint8_t uDst    = *puDst;
    uint8_t uResult = uDst - uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */

/*
 * SBB
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    if (!(*pfEFlags & X86_EFL_CF))
        iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
    else
    {
        uint64_t uDst    = *puDst;
        uint64_t uResult = uDst - uSrc - 1;
        *puDst = uResult;
        IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
    }
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    if (!(*pfEFlags & X86_EFL_CF))
        iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
    else
    {
        uint32_t uDst    = *puDst;
        uint32_t uResult = uDst - uSrc - 1;
        *puDst = uResult;
        IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
    }
}


IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    if (!(*pfEFlags & X86_EFL_CF))
        iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
    else
    {
        uint16_t uDst    = *puDst;
        uint16_t uResult = uDst - uSrc - 1;
        *puDst = uResult;
        IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
    }
}


IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
{
    if (!(*pfEFlags & X86_EFL_CF))
        iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
    else
    {
        uint8_t uDst    = *puDst;
        uint8_t uResult = uDst - uSrc - 1;
        *puDst = uResult;
        IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
    }
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */


/*
 * OR
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    uint64_t uResult = *puDst | uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    uint32_t uResult = *puDst | uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    uint16_t uResult = *puDst | uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
{
    uint8_t uResult = *puDst | uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */

/*
 * XOR
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    uint64_t uResult = *puDst ^ uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    uint32_t uResult = *puDst ^ uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    uint16_t uResult = *puDst ^ uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
{
    uint8_t uResult = *puDst ^ uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */

/*
 * AND
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    uint64_t const uResult = *puDst & uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    uint32_t const uResult = *puDst & uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    uint16_t const uResult = *puDst & uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
{
    uint8_t const uResult = *puDst & uSrc;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */

/*
 * ANDN (BMI1 instruction)
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
{
    uint64_t const uResult = ~uSrc1 & uSrc2;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
{
    uint32_t const uResult = ~uSrc1 & uSrc2;
    *puDst = uResult;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
}


#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
{
    iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
}
#endif


#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
{
    iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
}
#endif

#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)

/*
 * CMP
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    uint64_t uDstTmp = *puDst;
    iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    uint32_t uDstTmp = *puDst;
    iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    uint16_t uDstTmp = *puDst;
    iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
{
    uint8_t uDstTmp = *puDst;
    iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */

/*
 * TEST
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    uint64_t uResult = *puDst & uSrc;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    uint32_t uResult = *puDst & uSrc;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    uint16_t uResult = *puDst & uSrc;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
{
    uint8_t uResult = *puDst & uSrc;
    IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */


/*
 * LOCK prefixed variants of the above
 */

/** Locked binary operand operation, parameterized by mnemonic and width. */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uTmp; \
        uint32_t fEflTmp; \
        do \
        { \
            uTmp    = uOld; \
            fEflTmp = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, uSrc, &fEflTmp); \
        } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
        *pfEFlags = fEflTmp; \
    } while (0)


#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
                                                                                      uint ## a_cBitsWidth ## _t uSrc, \
                                                                                      uint32_t *pfEFlags)) \
    { \
        DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
    }
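
/* A disabled sketch (not from the original file) of roughly what
   EMIT_LOCKED_BIN_OP(add, 32) expands to: the non-locked worker runs on a
   private copy and the result is published with compare-and-swap, retrying
   if another CPU changed *puDst in the meantime. */
#if 0
IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32_locked,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    uint32_t uOld = ASMAtomicUoReadU32(puDst);
    uint32_t uTmp;
    uint32_t fEflTmp;
    do
    {
        uTmp    = uOld;                         /* work on a private copy */
        fEflTmp = *pfEFlags;
        iemAImpl_add_u32(&uTmp, uSrc, &fEflTmp);
    } while (!ASMAtomicCmpXchgExU32(puDst, uTmp, uOld, &uOld)); /* publish or retry */
    *pfEFlags = fEflTmp;
}
#endif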

EMIT_LOCKED_BIN_OP(add, 64)
EMIT_LOCKED_BIN_OP(adc, 64)
EMIT_LOCKED_BIN_OP(sub, 64)
EMIT_LOCKED_BIN_OP(sbb, 64)
EMIT_LOCKED_BIN_OP(or,  64)
EMIT_LOCKED_BIN_OP(xor, 64)
EMIT_LOCKED_BIN_OP(and, 64)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_LOCKED_BIN_OP(add, 32)
EMIT_LOCKED_BIN_OP(adc, 32)
EMIT_LOCKED_BIN_OP(sub, 32)
EMIT_LOCKED_BIN_OP(sbb, 32)
EMIT_LOCKED_BIN_OP(or,  32)
EMIT_LOCKED_BIN_OP(xor, 32)
EMIT_LOCKED_BIN_OP(and, 32)

EMIT_LOCKED_BIN_OP(add, 16)
EMIT_LOCKED_BIN_OP(adc, 16)
EMIT_LOCKED_BIN_OP(sub, 16)
EMIT_LOCKED_BIN_OP(sbb, 16)
EMIT_LOCKED_BIN_OP(or,  16)
EMIT_LOCKED_BIN_OP(xor, 16)
EMIT_LOCKED_BIN_OP(and, 16)

EMIT_LOCKED_BIN_OP(add, 8)
EMIT_LOCKED_BIN_OP(adc, 8)
EMIT_LOCKED_BIN_OP(sub, 8)
EMIT_LOCKED_BIN_OP(sbb, 8)
EMIT_LOCKED_BIN_OP(or,  8)
EMIT_LOCKED_BIN_OP(xor, 8)
EMIT_LOCKED_BIN_OP(and, 8)
# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */


/*
 * Bit operations (same signature as above).
 */

/*
 * BT
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    /* Note! "undefined" flags: OF, SF, ZF, AF, PF.  However, it seems they're
             not modified by either AMD (3990x) or Intel (i9-9980HK). */
    Assert(uSrc < 64);
    uint64_t uDst = *puDst;
    if (uDst & RT_BIT_64(uSrc))
        *pfEFlags |= X86_EFL_CF;
    else
        *pfEFlags &= ~X86_EFL_CF;
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    /* Note! "undefined" flags: OF, SF, ZF, AF, PF.  However, it seems they're
             not modified by either AMD (3990x) or Intel (i9-9980HK). */
    Assert(uSrc < 32);
    uint32_t uDst = *puDst;
    if (uDst & RT_BIT_32(uSrc))
        *pfEFlags |= X86_EFL_CF;
    else
        *pfEFlags &= ~X86_EFL_CF;
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    /* Note! "undefined" flags: OF, SF, ZF, AF, PF.  However, it seems they're
             not modified by either AMD (3990x) or Intel (i9-9980HK). */
    Assert(uSrc < 16);
    uint16_t uDst = *puDst;
    if (uDst & RT_BIT_32(uSrc))
        *pfEFlags |= X86_EFL_CF;
    else
        *pfEFlags &= ~X86_EFL_CF;
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */

/*
 * BTC
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    /* Note! "undefined" flags: OF, SF, ZF, AF, PF.  However, it seems they're
             not modified by either AMD (3990x) or Intel (i9-9980HK). */
    Assert(uSrc < 64);
    uint64_t fMask = RT_BIT_64(uSrc);
    uint64_t uDst  = *puDst;
    if (uDst & fMask)
    {
        uDst &= ~fMask;
        *puDst = uDst;
        *pfEFlags |= X86_EFL_CF;
    }
    else
    {
        uDst |= fMask;
        *puDst = uDst;
        *pfEFlags &= ~X86_EFL_CF;
    }
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    /* Note! "undefined" flags: OF, SF, ZF, AF, PF.  However, it seems they're
             not modified by either AMD (3990x) or Intel (i9-9980HK). */
    Assert(uSrc < 32);
    uint32_t fMask = RT_BIT_32(uSrc);
    uint32_t uDst  = *puDst;
    if (uDst & fMask)
    {
        uDst &= ~fMask;
        *puDst = uDst;
        *pfEFlags |= X86_EFL_CF;
    }
    else
    {
        uDst |= fMask;
        *puDst = uDst;
        *pfEFlags &= ~X86_EFL_CF;
    }
}


IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    /* Note! "undefined" flags: OF, SF, ZF, AF, PF.  However, it seems they're
             not modified by either AMD (3990x) or Intel (i9-9980HK). */
    Assert(uSrc < 16);
    uint16_t fMask = RT_BIT_32(uSrc);
    uint16_t uDst  = *puDst;
    if (uDst & fMask)
    {
        uDst &= ~fMask;
        *puDst = uDst;
        *pfEFlags |= X86_EFL_CF;
    }
    else
    {
        uDst |= fMask;
        *puDst = uDst;
        *pfEFlags &= ~X86_EFL_CF;
    }
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */

/*
 * BTR
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    /* Note! "undefined" flags: OF, SF, ZF, AF, PF.  We set them as after a
             logical operation (AND/OR/whatever). */
    Assert(uSrc < 64);
    uint64_t fMask = RT_BIT_64(uSrc);
    uint64_t uDst  = *puDst;
    if (uDst & fMask)
    {
        uDst &= ~fMask;
        *puDst = uDst;
        *pfEFlags |= X86_EFL_CF;
    }
    else
        *pfEFlags &= ~X86_EFL_CF;
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    /* Note! "undefined" flags: OF, SF, ZF, AF, PF.  We set them as after a
             logical operation (AND/OR/whatever). */
    Assert(uSrc < 32);
    uint32_t fMask = RT_BIT_32(uSrc);
    uint32_t uDst  = *puDst;
    if (uDst & fMask)
    {
        uDst &= ~fMask;
        *puDst = uDst;
        *pfEFlags |= X86_EFL_CF;
    }
    else
        *pfEFlags &= ~X86_EFL_CF;
}


IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    /* Note! "undefined" flags: OF, SF, ZF, AF, PF.  We set them as after a
             logical operation (AND/OR/whatever). */
    Assert(uSrc < 16);
    uint16_t fMask = RT_BIT_32(uSrc);
    uint16_t uDst  = *puDst;
    if (uDst & fMask)
    {
        uDst &= ~fMask;
        *puDst = uDst;
        *pfEFlags |= X86_EFL_CF;
    }
    else
        *pfEFlags &= ~X86_EFL_CF;
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */

/*
 * BTS
 */

IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    /* Note! "undefined" flags: OF, SF, ZF, AF, PF.  We set them as after a
             logical operation (AND/OR/whatever). */
    Assert(uSrc < 64);
    uint64_t fMask = RT_BIT_64(uSrc);
    uint64_t uDst  = *puDst;
    if (uDst & fMask)
        *pfEFlags |= X86_EFL_CF;
    else
    {
        uDst |= fMask;
        *puDst = uDst;
        *pfEFlags &= ~X86_EFL_CF;
    }
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    /* Note! "undefined" flags: OF, SF, ZF, AF, PF.  We set them as after a
             logical operation (AND/OR/whatever). */
    Assert(uSrc < 32);
    uint32_t fMask = RT_BIT_32(uSrc);
    uint32_t uDst  = *puDst;
    if (uDst & fMask)
        *pfEFlags |= X86_EFL_CF;
    else
    {
        uDst |= fMask;
        *puDst = uDst;
        *pfEFlags &= ~X86_EFL_CF;
    }
}


IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    /* Note! "undefined" flags: OF, SF, ZF, AF, PF.  We set them as after a
             logical operation (AND/OR/whatever). */
    Assert(uSrc < 16);
    uint16_t fMask = RT_BIT_32(uSrc);
    uint16_t uDst  = *puDst;
    if (uDst & fMask)
        *pfEFlags |= X86_EFL_CF;
    else
    {
        uDst |= fMask;
        *puDst = uDst;
        *pfEFlags &= ~X86_EFL_CF;
    }
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */


EMIT_LOCKED_BIN_OP(btc, 64)
EMIT_LOCKED_BIN_OP(btr, 64)
EMIT_LOCKED_BIN_OP(bts, 64)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_LOCKED_BIN_OP(btc, 32)
EMIT_LOCKED_BIN_OP(btr, 32)
EMIT_LOCKED_BIN_OP(bts, 32)

EMIT_LOCKED_BIN_OP(btc, 16)
EMIT_LOCKED_BIN_OP(btr, 16)
EMIT_LOCKED_BIN_OP(bts, 16)
# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */


/*
 * Helpers for BSR and BSF.
 *
 * Note! "undefined" flags: OF, SF, AF, PF, CF.
 *       Intel behavior modelled on 10980xe, AMD on 3990X.  Other
 *       microarchitectures may produce different results (see
 *       https://www.sandpile.org/x86/flags.htm), but we restrict ourselves to
 *       emulating these recent ones.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *puDst = --iBit; \
            fEfl  |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *pfEFlags = fEfl; \
    } while (0)
#define SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *puDst = iBit - 1; \
            *pfEFlags &= ~X86_EFL_ZF; \
        } \
        else \
            *pfEFlags |= X86_EFL_ZF; \
    } while (0)
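
/* A disabled sketch (not from the original file) of the helper contract: the
   ASMBit{First,Last}SetUxx workers used below return a 1-based bit index, or
   0 when no bit is set, so "iBit - 1" recovers the 0-based BSF/BSR result and
   iBit == 0 maps to ZF=1 with the destination left unwritten. */
#if 0
unsigned iBit = ASMBitFirstSetU32(0x00000018); /* -> 4, i.e. bit 3 is the lowest set bit  */
unsigned iTop = ASMBitLastSetU32(0x00000018);  /* -> 5, i.e. bit 4 is the highest set bit */
#endif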


/*
 * BSF - first (least significant) bit set
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
}


IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */


/*
 * BSR - last (most significant) bit set
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
}


IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
}

IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */


/*
 * Helpers for LZCNT and TZCNT.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
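
/* Illustrative flag contract of the helpers above (not from the original
   file): CF reports a zero source (count == operand width), ZF a zero count.
   E.g. for the 32-bit LZCNT built on ASMCountLeadingZerosU32:
       lzcnt_u32(0x00000001) -> 31, CF=0, ZF=0
       lzcnt_u32(0x80000000) -> 0,  CF=0, ZF=1
       lzcnt_u32(0x00000000) -> 32, CF=1, ZF=0 */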
1536
1537
1538/*
1539 * LZCNT - count leading zero bits.
1540 */
1541IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1542{
1543 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1544}
1545
1546IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1547{
1548 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1549}
1550
1551IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1552{
1553 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1554}
1555
1556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1557
1558IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1559{
1560 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1561}
1562
1563IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1564{
1565 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1566}
1567
1568IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1569{
1570 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1571}
1572
1573
1574IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1575{
1576 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1577}
1578
1579IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1580{
1581 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1582}
1583
1584IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1585{
1586 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1587}
1588
1589# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1590
1591
1592/*
1593 * TZCNT - count trailing zero bits.
1594 */
1595IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1596{
1597 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1598}
1599
1600IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1601{
1602 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1603}
1604
1605IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1606{
1607 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1608}
1609
1610# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1611
1612IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1613{
1614 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1615}
1616
1617IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1618{
1619 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1620}
1621
1622IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1623{
1624 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1625}
1626
1627
1628IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1629{
1630 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1631}
1632
1633IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1634{
1635 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1636}
1637
1638IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1639{
1640 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1641}
1642
1643# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1644#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1645
1646/*
1647 * BEXTR (BMI1 instruction)
1648 */
1649#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1650IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1651 a_Type uSrc2, uint32_t *pfEFlags)) \
1652{ \
1653 /* uSrc1 is considered to be virtually zero-extended to 512 bits. */ \
1654 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1655 a_Type uResult; \
1656 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1657 if (iFirstBit < a_cBits) \
1658 { \
1659 uResult = uSrc1 >> iFirstBit; \
1660 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1661 if (cBits < a_cBits) \
1662 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1663 *puDst = uResult; \
1664 if (!uResult) \
1665 fEfl |= X86_EFL_ZF; \
1666 } \
1667 else \
1668 { \
1669 *puDst = uResult = 0; \
1670 fEfl |= X86_EFL_ZF; \
1671 } \
1672 /** @todo complete flag calculations. */ \
1673 *pfEFlags = fEfl; \
1674}
1675
1676EMIT_BEXTR(64, uint64_t, _fallback)
1677EMIT_BEXTR(32, uint32_t, _fallback)
1678#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1679EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1680#endif
1681#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1682EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1683#endif
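
/* Illustrative sketch (not built): the BEXTR control word packs the start bit
   into bits 7:0 of uSrc2 and the length into bits 15:8. Made-up values; the
   helper name is hypothetical. */
#if 0 /* example only */
static void iemExampleBextr(void)
{
    uint32_t uDst, fEfl = 0;
    /* Extract 8 bits starting at bit 4: (0x12345678 >> 4) & 0xff = 0x67. */
    iemAImpl_bextr_u32_fallback(&uDst, UINT32_C(0x12345678), UINT32_C(0x0804), &fEfl);
    Assert(uDst == UINT32_C(0x67) && !(fEfl & X86_EFL_ZF));
}
#endif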
1684
1685/*
1686 * BLSR (BMI1 instruction)
1687 */
1688#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1689IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1690{ \
1691 uint32_t fEfl1 = *pfEFlags; \
1692 uint32_t fEfl2 = fEfl1; \
1693 *puDst = uSrc; \
1694 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1695 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1696 \
1697 /* AMD: The carry flag is from the SUB operation. */ \
1698 /* 10980xe: PF always cleared? */ \
1699 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1700 fEfl2 |= fEfl1 & X86_EFL_CF; \
1701 *pfEFlags = fEfl2; \
1702}
1703
1704EMIT_BLSR(64, uint64_t, _fallback)
1705EMIT_BLSR(32, uint32_t, _fallback)
1706#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BLSR(64, uint64_t, RT_NOTHING)
1708#endif
1709#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1710EMIT_BLSR(32, uint32_t, RT_NOTHING)
1711#endif
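
/* Illustrative sketch (not built): BLSR computes dst = src & (src - 1), which
   clears the lowest set bit; CF comes from the SUB and is thus only set for a
   zero source. Made-up values; the helper name is hypothetical. */
#if 0 /* example only */
static void iemExampleBlsr(void)
{
    uint32_t uDst, fEfl = 0;
    iemAImpl_blsr_u32_fallback(&uDst, UINT32_C(0xb0), &fEfl); /* 1011 0000 */
    Assert(uDst == UINT32_C(0xa0) && !(fEfl & X86_EFL_CF));   /* 1010 0000 */
}
#endif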
1712
1713/*
1714 * BLSMSK (BMI1 instruction)
1715 */
1716#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1717IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1718{ \
1719 uint32_t fEfl1 = *pfEFlags; \
1720 uint32_t fEfl2 = fEfl1; \
1721 *puDst = uSrc; \
1722 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1723 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1724 \
1725 /* AMD: The carry flag is from the SUB operation. */ \
1726 /* 10980xe: PF always cleared? */ \
1727 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1728 fEfl2 |= fEfl1 & X86_EFL_CF; \
1729 *pfEFlags = fEfl2; \
1730}
1731
1732EMIT_BLSMSK(64, uint64_t, _fallback)
1733EMIT_BLSMSK(32, uint32_t, _fallback)
1734#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1736#endif
1737#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1738EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1739#endif
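
/* Illustrative sketch (not built): BLSMSK computes dst = src ^ (src - 1),
   producing a mask that covers bit 0 up to and including the lowest set bit.
   Made-up values; the helper name is hypothetical. */
#if 0 /* example only */
static void iemExampleBlsmsk(void)
{
    uint32_t uDst, fEfl = 0;
    iemAImpl_blsmsk_u32_fallback(&uDst, UINT32_C(0xb0), &fEfl); /* lowest set bit: 4 */
    Assert(uDst == UINT32_C(0x1f)); /* bits 4:0 */
}
#endif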
1740
1741/*
1742 * BLSI (BMI1 instruction)
1743 */
1744#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1745IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1746{ \
1747 uint32_t fEfl1 = *pfEFlags; \
1748 uint32_t fEfl2 = fEfl1; \
1749 *puDst = uSrc; \
1750 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1751 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1752 \
1753 /* AMD: The carry flag is from the NEG operation. */ \
1754 /* 10980xe: PF always cleared? */ \
1755 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1756 fEfl2 |= fEfl1 & X86_EFL_CF; \
1757 *pfEFlags = fEfl2; \
1758}
1759
1760EMIT_BLSI(64, uint64_t, _fallback)
1761EMIT_BLSI(32, uint32_t, _fallback)
1762#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSI(64, uint64_t, RT_NOTHING)
1764#endif
1765#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1766EMIT_BLSI(32, uint32_t, RT_NOTHING)
1767#endif
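
/* Illustrative sketch (not built): BLSI computes dst = src & (0 - src), which
   isolates the lowest set bit. Made-up values; the helper name is
   hypothetical. */
#if 0 /* example only */
static void iemExampleBlsi(void)
{
    uint32_t uDst, fEfl = 0;
    iemAImpl_blsi_u32_fallback(&uDst, UINT32_C(0xb0), &fEfl); /* 0xb0 & 0xffffff50 */
    Assert(uDst == UINT32_C(0x10));
}
#endif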
1768
1769/*
1770 * BZHI (BMI2 instruction)
1771 */
1772#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1773IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1774 a_Type uSrc2, uint32_t *pfEFlags)) \
1775{ \
1776 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1777 a_Type uResult; \
1778 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1779 if (iFirstBit < a_cBits) \
1780 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1781 else \
1782 { \
1783 uResult = uSrc1; \
1784 fEfl |= X86_EFL_CF; \
1785 } \
1786 *puDst = uResult; \
1787 fEfl |= X86_EFL_CALC_ZF(uResult); \
1788 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1789 *pfEFlags = fEfl; \
1790}
1791
1792EMIT_BZHI(64, uint64_t, _fallback)
1793EMIT_BZHI(32, uint32_t, _fallback)
1794#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1795EMIT_BZHI(64, uint64_t, RT_NOTHING)
1796#endif
1797#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1798EMIT_BZHI(32, uint32_t, RT_NOTHING)
1799#endif
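
/* Illustrative sketch (not built): BZHI keeps the bits below the index given
   in uSrc2 and zeroes the rest; an out-of-range index keeps the whole source
   and sets CF. Made-up values; the helper name is hypothetical. */
#if 0 /* example only */
static void iemExampleBzhi(void)
{
    uint32_t uDst, fEfl = 0;
    iemAImpl_bzhi_u32_fallback(&uDst, UINT32_C(0x12345678), 8 /*uSrc2*/, &fEfl);
    Assert(uDst == UINT32_C(0x78) && !(fEfl & X86_EFL_CF));
}
#endif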
1800
1801/*
1802 * POPCNT
1803 */
1804RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1805{
1806 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1807 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1808 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1809 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1810};
1811
1812/** @todo Use native popcount where possible and employ some more efficient
1813 * algorithm here (or in asm.h fallback)! */
1814
1815DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1816{
1817 return g_abBitCounts6[ u16 & 0x3f]
1818 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1819 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1820}
1821
1822DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1823{
1824 return g_abBitCounts6[ u32 & 0x3f]
1825 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1826 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1827 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1828 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1829 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1830}
1831
1832DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1833{
1834 return g_abBitCounts6[ u64 & 0x3f]
1835 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1836 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1837 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1838 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1839 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1840 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1841 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1842 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1843 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1844 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1845}
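
/* Illustrative sketch (not built): the helpers above sum per-chunk counts from
   the 6-bit lookup table, e.g. for the made-up value 0xf0f0 the chunks are
   110000b, 000011b and 1111b (the last chunk of a 16-bit value is only 4 bits
   wide), giving 2 + 2 + 4 = 8. */
#if 0 /* example only */
static void iemExamplePopCount(void)
{
    Assert(iemPopCountU16(UINT16_C(0xf0f0)) == 8);
}
#endif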
1846
1847#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1848IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1849{ \
1850 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1851 a_Type uResult; \
1852 if (uSrc) \
1853 uResult = iemPopCountU ## a_cBits(uSrc); \
1854 else \
1855 { \
1856 fEfl |= X86_EFL_ZF; \
1857 uResult = 0; \
1858 } \
1859 *puDst = uResult; \
1860 *pfEFlags = fEfl; \
1861}
1862
1863EMIT_POPCNT(64, uint64_t, _fallback)
1864EMIT_POPCNT(32, uint32_t, _fallback)
1865EMIT_POPCNT(16, uint16_t, _fallback)
1866#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1867EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1868#endif
1869#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1870EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1871EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1872#endif
1873
1874
1875#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1876
1877/*
1878 * XCHG
1879 */
1880
1881IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1882{
1883#if ARCH_BITS >= 64
1884 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1885#else
1886 uint64_t uOldMem = *puMem;
1887 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1888 ASMNopPause();
1889 *puReg = uOldMem;
1890#endif
1891}
1892
1893# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1894
1895IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1896{
1897 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1898}
1899
1900
1901IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1902{
1903 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1904}
1905
1906
1907IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1908{
1909 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1910}
1911
1912# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1913
1914
1915/* Unlocked variants for fDisregardLock mode: */
1916
1917IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1918{
1919 uint64_t const uOld = *puMem;
1920 *puMem = *puReg;
1921 *puReg = uOld;
1922}
1923
1924# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1927{
1928 uint32_t const uOld = *puMem;
1929 *puMem = *puReg;
1930 *puReg = uOld;
1931}
1932
1933
1934IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1935{
1936 uint16_t const uOld = *puMem;
1937 *puMem = *puReg;
1938 *puReg = uOld;
1939}
1940
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1943{
1944 uint8_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1950
1951
1952/*
1953 * XADD and LOCK XADD.
1954 */
1955#define EMIT_XADD(a_cBitsWidth, a_Type) \
1956IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1957{ \
1958 a_Type uDst = *puDst; \
1959 a_Type uResult = uDst; \
1960 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1961 *puDst = uResult; \
1962 *puReg = uDst; \
1963} \
1964\
1965IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1966{ \
1967 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1968 a_Type uResult; \
1969 uint32_t fEflTmp; \
1970 do \
1971 { \
1972 uResult = uOld; \
1973 fEflTmp = *pfEFlags; \
1974 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
1975 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
1976 *puReg = uOld; \
1977 *pfEFlags = fEflTmp; \
1978}
1979EMIT_XADD(64, uint64_t)
1980# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1981EMIT_XADD(32, uint32_t)
1982EMIT_XADD(16, uint16_t)
1983EMIT_XADD(8, uint8_t)
1984# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
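
/* Illustrative sketch (not built): what the emulated XADD leaves behind, with
   made-up values; XADD stores dst + reg in the destination and the old
   destination value in the register. The helper name is hypothetical. */
#if 0 /* example only */
static void iemExampleXadd(void)
{
    uint32_t uMem = 5, uReg = 7, fEfl = 0;
    iemAImpl_xadd_u32(&uMem, &uReg, &fEfl);
    Assert(uMem == 12 && uReg == 5);
}
#endif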
1985
1986#endif
1987
1988/*
1989 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
1990 *
1991 * Note! We don't have plain non-atomic cmpxchg primitives, so all cmpxchg
1992 * instructions are emulated as locked.
1993 */
1994#if defined(IEM_WITHOUT_ASSEMBLY)
1995
1996IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
1997{
1998 uint8_t uOld = *puAl;
1999 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2000 Assert(*puAl == uOld);
2001 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2002}
2003
2004
2005IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2006{
2007 uint16_t uOld = *puAx;
2008 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2009 Assert(*puAx == uOld);
2010 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2011}
2012
2013
2014IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2015{
2016 uint32_t uOld = *puEax;
2017 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2018 Assert(*puEax == uOld);
2019 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2020}
2021
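/* Illustrative sketch (not built): CMPXCHG semantics as implemented above,
   with made-up values. On a match the destination takes the new value and ZF
   is set; on a mismatch the accumulator receives the current destination
   value and ZF ends up clear. The helper name is hypothetical. */
#if 0 /* example only */
static void iemExampleCmpXchg(void)
{
    uint32_t uMem = 42, uEax = 42, fEfl = 0;
    iemAImpl_cmpxchg_u32_locked(&uMem, &uEax, 99 /*uSrcReg*/, &fEfl);
    Assert(uMem == 99 && uEax == 42 && (fEfl & X86_EFL_ZF));

    uEax = 7;
    iemAImpl_cmpxchg_u32_locked(&uMem, &uEax, 1 /*uSrcReg*/, &fEfl);
    Assert(uMem == 99 && uEax == 99 && !(fEfl & X86_EFL_ZF));
}
#endif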
2022
2023# if ARCH_BITS == 32
2024IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2025# else
2026IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2027# endif
2028{
2029# if ARCH_BITS == 32
2030 uint64_t const uSrcReg = *puSrcReg;
2031# endif
2032 uint64_t uOld = *puRax;
2033 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2034 Assert(*puRax == uOld);
2035 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2040 uint32_t *pEFlags))
2041{
2042 uint64_t const uNew = pu64EbxEcx->u;
2043 uint64_t const uOld = pu64EaxEdx->u;
2044 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2045 {
2046 Assert(pu64EaxEdx->u == uOld);
2047 *pEFlags |= X86_EFL_ZF;
2048 }
2049 else
2050 *pEFlags &= ~X86_EFL_ZF;
2051}
2052
2053
2054# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
2055IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2056 uint32_t *pEFlags))
2057{
2058# ifdef VBOX_STRICT
2059 RTUINT128U const uOld = *pu128RaxRdx;
2060# endif
2061# if defined(RT_ARCH_AMD64)
2062 if (ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
2063 &pu128RaxRdx->u))
2064# else
2065 if (ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u))
2066# endif
2067 {
2068 Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
2069 *pEFlags |= X86_EFL_ZF;
2070 }
2071 else
2072 *pEFlags &= ~X86_EFL_ZF;
2073}
2074# endif
2075
2076#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2077
2078#if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2079IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2080 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2081{
2082 RTUINT128U u128Tmp = *pu128Dst;
2083 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2084 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2085 {
2086 *pu128Dst = *pu128RbxRcx;
2087 *pEFlags |= X86_EFL_ZF;
2088 }
2089 else
2090 {
2091 *pu128RaxRdx = u128Tmp;
2092 *pEFlags &= ~X86_EFL_ZF;
2093 }
2094}
2095#endif /* !RT_ARCH_ARM64 */
2096
2097#if defined(IEM_WITHOUT_ASSEMBLY)
2098
2099/* Unlocked versions mapped to the locked ones: */
2100
2101IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2102{
2103 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2104}
2105
2106
2107IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2108{
2109 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2110}
2111
2112
2113IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2114{
2115 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2116}
2117
2118
2119# if ARCH_BITS == 32
2120IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2121{
2122 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2123}
2124# else
2125IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2126{
2127 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2128}
2129# endif
2130
2131
2132IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2133{
2134 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2135}
2136
2137
2138IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2139 uint32_t *pEFlags))
2140{
2141 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2142}
2143
2144#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2145
2146#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2147 && !defined(DOXYGEN_RUNNING) /* Doxygen has some grokking issues here and ends up mixing up input. Not worth tracking down now. */
2148
2149/*
2150 * MUL, IMUL, DIV and IDIV helpers.
2151 *
2152 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2153 * division step so we can select between using C operators and
2154 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2155 *
2156 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
2157 * IDIV/DIV taking all their input in AX too. This means we have to abstract
2158 * some of the input loads and the result stores.
2159 */
2160
2161DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2162{
2163# ifdef __GNUC__ /* GCC may otherwise be really annoying here. */
2164 pQuotient->s.Lo = 0;
2165 pQuotient->s.Hi = 0;
2166# endif
2167 RTUINT128U Divisor;
2168 Divisor.s.Lo = u64Divisor;
2169 Divisor.s.Hi = 0;
2170 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2171}
2172
2173# define DIV_LOAD(a_Dividend) \
2174 a_Dividend.s.Lo = *puA, a_Dividend.s.Hi = *puD
2175# define DIV_LOAD_U8(a_Dividend) \
2176 a_Dividend.u = *puAX
2177
2178# define DIV_STORE(a_Quotient, a_uRemainder) *puA = (a_Quotient), *puD = (a_uRemainder)
2179# define DIV_STORE_U8(a_Quotient, a_uRemainder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8)
2180
2181# define MUL_LOAD_F1() *puA
2182# define MUL_LOAD_F1_U8() ((uint8_t)*puAX)
2183
2184# define MUL_STORE(a_Result) *puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi
2185# define MUL_STORE_U8(a_Result) *puAX = a_Result.u
2186
2187# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
2188 (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
2189# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
2190 RTUInt128AssignNeg(&(a_Value))
2191
2192# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
2193 (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
2194# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
2195 RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2);
2196
2197# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
2198 a_Quotient.u = (a_Dividend).u / (a_uDivisor), \
2199 a_Remainder.u = (a_Dividend).u % (a_uDivisor)
2200# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
2201 RTUInt128DivRemByU64(&a_Quotient, &a_Remainder, &a_Dividend, a_uDivisor)
2202
2203
2204/*
2205 * MUL
2206 */
2207# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2208IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2209{ \
2210 RTUINT ## a_cBitsWidth2x ## U Result; \
2211 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2212 a_fnStore(Result); \
2213 \
2214 /* Calc EFLAGS: */ \
2215 uint32_t fEfl = *pfEFlags; \
2216 if (a_fIntelFlags) \
2217 { /* Intel: 6700K and 10980XE behavior */ \
2218 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2219 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2220 fEfl |= X86_EFL_SF; \
2221 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2222 if (Result.s.Hi != 0) \
2223 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2224 } \
2225 else \
2226 { /* AMD: 3990X */ \
2227 if (Result.s.Hi != 0) \
2228 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2229 else \
2230 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2231 } \
2232 *pfEFlags = fEfl; \
2233 return 0; \
2234} \
2235
2236# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2237 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2238 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2239 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2240
2241# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2242EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2243 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2244# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2245EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2246 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2247EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2248 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2249EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2250 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2251# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2252# endif /* !DOXYGEN_RUNNING */
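
/* Illustrative sketch (not built): MUL widens into xDX:xAX and signals a
   non-zero high half via CF/OF. Made-up values; the helper name is
   hypothetical. */
#if 0 /* example only */
static void iemExampleMul(void)
{
    uint32_t uEax = UINT32_C(0x80000000), uEdx = 0, fEfl = 0;
    iemAImpl_mul_u32(&uEax, &uEdx, 4 /*uFactor*/, &fEfl); /* 2^31 * 4 = 2^33 */
    Assert(uEax == 0 && uEdx == 2 && (fEfl & X86_EFL_CF) && (fEfl & X86_EFL_OF));
}
#endif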
2253
2254/*
2255 * MULX
2256 */
2257# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2258IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2259 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2260{ \
2261 RTUINT ## a_cBitsWidth2x ## U Result; \
2262 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2263 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2264 *puDst1 = Result.s.Hi; \
2265} \
2266
2267# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2268EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2269EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2270# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2271EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2272EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2273# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2274# endif /* !DOXYGEN_RUNNING */
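
/* Illustrative sketch (not built): MULX multiplies without touching EFLAGS,
   and when both destinations alias the high half wins, which is why the low
   part is stored first above. Made-up values; the helper name is
   hypothetical. */
#if 0 /* example only */
static void iemExampleMulx(void)
{
    uint32_t uHi, uLo;
    iemAImpl_mulx_u32_fallback(&uHi, &uLo, UINT32_C(0x10000), UINT32_C(0x10000));
    Assert(uHi == 1 && uLo == 0); /* 2^16 * 2^16 = 2^32 */

    uint32_t uSame;
    iemAImpl_mulx_u32_fallback(&uSame, &uSame, UINT32_C(0x10000), UINT32_C(0x10000));
    Assert(uSame == 1); /* the high part survives */
}
#endif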
2275
2276
2277/*
2278 * IMUL
2279 *
2280 * The SF, ZF, AF and PF flags are "undefined". AMD (3990X) leaves these
2281 * flags as-is, whereas Intel (Skylake 6700K and Cascade Lake 10980XE) always
2282 * clears AF and ZF and calculates SF and PF from the lower half of the result.
2283 */
2284# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2285 a_Suffix, a_fIntelFlags) \
2286IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2287{ \
2288 RTUINT ## a_cBitsWidth2x ## U Result; \
2289 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2290 \
2291 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2292 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2293 { \
2294 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2295 { \
2296 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2297 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2298 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2299 } \
2300 else \
2301 { \
2302 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2303 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2304 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2305 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2306 a_fnNeg(Result, a_cBitsWidth2x); \
2307 } \
2308 } \
2309 else \
2310 { \
2311 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2312 { \
2313 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2314 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2315 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2316 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2317 a_fnNeg(Result, a_cBitsWidth2x); \
2318 } \
2319 else \
2320 { \
2321 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2322 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2323 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2324 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2325 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2326 } \
2327 } \
2328 a_fnStore(Result); \
2329 \
2330 if (a_fIntelFlags) \
2331 { \
2332 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2333 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2334 fEfl |= X86_EFL_SF; \
2335 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2336 } \
2337 *pfEFlags = fEfl; \
2338 return 0; \
2339}
2340# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2341 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2342 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2343 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2344
2345# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2346EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2347 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2348# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2349EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2350 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2351EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2352 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2353EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2354 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2355# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2356# endif /* !DOXYGEN_RUNNING */
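
/* Illustrative sketch (not built): the sign handling above converts IMUL into
   an unsigned multiply and negates the result when exactly one factor was
   negative. Made-up values; the helper name is hypothetical. */
#if 0 /* example only */
static void iemExampleImul(void)
{
    uint32_t uEax = (uint32_t)-7, uEdx = 0, fEfl = 0;
    iemAImpl_imul_u32(&uEax, &uEdx, 3 /*uFactor2*/, &fEfl);
    Assert((int32_t)uEax == -21 && uEdx == UINT32_MAX /* sign extension */);
    Assert(!(fEfl & (X86_EFL_CF | X86_EFL_OF))); /* the product fits in 32 bits */
}
#endif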
2357
2358
2359/*
2360 * IMUL with two operands is mapped onto the three operand variant, ignoring
2361 * the high part of the product.
2362 */
2363# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2364IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2365{ \
2366 a_uType uIgn; \
2367 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2368} \
2369\
2370IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2371{ \
2372 a_uType uIgn; \
2373 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2374} \
2375\
2376IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2377{ \
2378 a_uType uIgn; \
2379 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2380}
2381
2382EMIT_IMUL_TWO(64, uint64_t)
2383# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2384EMIT_IMUL_TWO(32, uint32_t)
2385EMIT_IMUL_TWO(16, uint16_t)
2386# endif
2387
2388
2389/*
2390 * DIV
2391 */
2392# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2393 a_Suffix, a_fIntelFlags) \
2394IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2395{ \
2396 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2397 a_fnLoad(Dividend); \
2398 if ( uDivisor != 0 \
2399 && Dividend.s.Hi < uDivisor) \
2400 { \
2401 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2402 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2403 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2404 \
2405 /* Calc EFLAGS: Intel 6700K and 10980XE leave them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2406 if (!a_fIntelFlags) \
2407 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2408 return 0; \
2409 } \
2410 /* #DE */ \
2411 return -1; \
2412}
2413# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2414 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2415 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2416 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2417
2418# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2419EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2420 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2421# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2422EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2423 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2424EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2425 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2426EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2427 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2428# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2429# endif /* !DOXYGEN_RUNNING */
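
/* Illustrative sketch (not built): the Dividend.s.Hi < uDivisor test above
   rejects both division by zero and quotient overflow via the -1 (#DE)
   return. Made-up values; the helper name is hypothetical. */
#if 0 /* example only */
static void iemExampleDiv(void)
{
    uint32_t uEax = 100, uEdx = 0, fEfl = 0;
    Assert(iemAImpl_div_u32(&uEax, &uEdx, 7, &fEfl) == 0);
    Assert(uEax == 14 && uEdx == 2); /* 100 = 7 * 14 + 2 */

    uEax = 0; uEdx = 1; /* dividend 2^32: the quotient cannot fit in 32 bits */
    Assert(iemAImpl_div_u32(&uEax, &uEdx, 1, &fEfl) == -1); /* #DE */
}
#endif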
2430
2431
2432/*
2433 * IDIV
2434 *
2435 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2436 * set AF and clear PF, ZF and SF just like it does for DIV.
2437 *
2438 */
2439# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2440 a_Suffix, a_fIntelFlags) \
2441IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2442{ \
2443 /* Note! Skylake leaves all flags alone. */ \
2444 \
2445 /** @todo overflow checks */ \
2446 if (uDivisor != 0) \
2447 { \
2448 /* \
2449 * Convert to unsigned division. \
2450 */ \
2451 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2452 a_fnLoad(Dividend); \
2453 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2454 if (fSignedDividend) \
2455 a_fnNeg(Dividend, a_cBitsWidth2x); \
2456 \
2457 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2458 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2459 uDivisorPositive = uDivisor; \
2460 else \
2461 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2462 \
2463 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2464 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2465 \
2466 /* \
2467 * Setup the result, checking for overflows. \
2468 */ \
2469 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2470 { \
2471 if (!fSignedDividend) \
2472 { \
2473 /* Positive divisor, positive dividend => result positive. */ \
2474 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2475 { \
2476 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2477 if (!a_fIntelFlags) \
2478 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2479 return 0; \
2480 } \
2481 } \
2482 else \
2483 { \
2484 /* Positive divisor, negative dividend => result negative. */ \
2485 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2486 { \
2487 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2488 if (!a_fIntelFlags) \
2489 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2490 return 0; \
2491 } \
2492 } \
2493 } \
2494 else \
2495 { \
2496 if (!fSignedDividend) \
2497 { \
2498 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2499 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2500 { \
2501 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2502 if (!a_fIntelFlags) \
2503 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2504 return 0; \
2505 } \
2506 } \
2507 else \
2508 { \
2509 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2510 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2511 { \
2512 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2513 if (!a_fIntelFlags) \
2514 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2515 return 0; \
2516 } \
2517 } \
2518 } \
2519 } \
2520 /* #DE */ \
2521 return -1; \
2522}
2523# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2524 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2525 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2526 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2527
2528# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2529EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2530 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2531# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2532EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2533 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2534EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2535 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2536EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2537 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2538# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2539# endif /* !DOXYGEN_RUNNING */
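
/* Illustrative sketch (not built): IDIV truncates towards zero and the
   remainder takes the sign of the dividend. Made-up values; the helper name
   is hypothetical. */
#if 0 /* example only */
static void iemExampleIdiv(void)
{
    uint32_t uEax = (uint32_t)-100, uEdx = UINT32_MAX /* sign extended */, fEfl = 0;
    Assert(iemAImpl_idiv_u32(&uEax, &uEdx, 7, &fEfl) == 0);
    Assert((int32_t)uEax == -14 && (int32_t)uEdx == -2);
}
#endif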
2540
2541#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2542
2543
2544/*********************************************************************************************************************************
2545* Unary operations. *
2546*********************************************************************************************************************************/
2547#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2548
2549/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
2550 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
2551 *
2552 * CF is NOT modified for hysterical raisins (allegedly for carrying and
2553 * borrowing in arithmetic loops on the Intel 8008).
2554 *
2555 * @returns Status bits.
2556 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2557 * @param a_uResult Unsigned result value.
2558 * @param a_uDst The original destination value (for AF calc).
2559 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2560 * @param a_OfMethod 0 for INC-style, 1 for DEC-style.
2561 */
2562#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
2563 do { \
2564 uint32_t fEflTmp = *(a_pfEFlags); \
2565 fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; \
2566 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
2567 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2568 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2569 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2570 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
2571 : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
2572 *(a_pfEFlags) = fEflTmp; \
2573 } while (0)
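
/* Illustrative sketch (not built): unlike ADD/SUB by one, INC/DEC leave CF
   untouched. Made-up values; the helper name is hypothetical. */
#if 0 /* example only */
static void iemExampleDecCarry(void)
{
    uint64_t uVal = 0;
    uint32_t fEfl = X86_EFL_CF;
    iemAImpl_dec_u64(&uVal, &fEfl);
    Assert(uVal == UINT64_MAX && (fEfl & X86_EFL_CF)); /* CF survived the wrap */
}
#endif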
2574
2575/*
2576 * INC
2577 */
2578
2579IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2580{
2581 uint64_t uDst = *puDst;
2582 uint64_t uResult = uDst + 1;
2583 *puDst = uResult;
2584 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2585}
2586
2587# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2588
2589IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2590{
2591 uint32_t uDst = *puDst;
2592 uint32_t uResult = uDst + 1;
2593 *puDst = uResult;
2594 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2595}
2596
2597
2598IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2599{
2600 uint16_t uDst = *puDst;
2601 uint16_t uResult = uDst + 1;
2602 *puDst = uResult;
2603 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2604}
2605
2606IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2607{
2608 uint8_t uDst = *puDst;
2609 uint8_t uResult = uDst + 1;
2610 *puDst = uResult;
2611 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2612}
2613
2614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2615
2616
2617/*
2618 * DEC
2619 */
2620
2621IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2622{
2623 uint64_t uDst = *puDst;
2624 uint64_t uResult = uDst - 1;
2625 *puDst = uResult;
2626 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*DEC*/);
2627}
2628
2629# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint32_t uDst = *puDst;
2634 uint32_t uResult = uDst - 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*DEC*/);
2637}
2638
2639
2640IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2641{
2642 uint16_t uDst = *puDst;
2643 uint16_t uResult = uDst - 1;
2644 *puDst = uResult;
2645 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*DEC*/);
2646}
2647
2648
2649IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2650{
2651 uint8_t uDst = *puDst;
2652 uint8_t uResult = uDst - 1;
2653 *puDst = uResult;
2654 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*DEC*/);
2655}
2656
2657# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2658
2659
2660/*
2661 * NOT
2662 */
2663
2664IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2665{
2666 uint64_t uDst = *puDst;
2667 uint64_t uResult = ~uDst;
2668 *puDst = uResult;
2669 /* EFLAGS are not modified. */
2670 RT_NOREF_PV(pfEFlags);
2671}
2672
2673# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2674
2675IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2676{
2677 uint32_t uDst = *puDst;
2678 uint32_t uResult = ~uDst;
2679 *puDst = uResult;
2680 /* EFLAGS are not modified. */
2681 RT_NOREF_PV(pfEFlags);
2682}
2683
2684IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2685{
2686 uint16_t uDst = *puDst;
2687 uint16_t uResult = ~uDst;
2688 *puDst = uResult;
2689 /* EFLAGS are not modified. */
2690 RT_NOREF_PV(pfEFlags);
2691}
2692
2693IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2694{
2695 uint8_t uDst = *puDst;
2696 uint8_t uResult = ~uDst;
2697 *puDst = uResult;
2698 /* EFLAGS are not modified. */
2699 RT_NOREF_PV(pfEFlags);
2700}
2701
2702# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2703
2704
2705/*
2706 * NEG
2707 */
2708
2709/**
2710 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for a NEG instruction.
2711 *
2712 * @returns Status bits.
2713 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2714 * @param a_uResult Unsigned result value.
2715 * @param a_uDst The original destination value (for AF calc).
2716 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2717 */
2718#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
2719 do { \
2720 uint32_t fEflTmp = *(a_pfEFlags); \
2721 fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; \
2722 fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
2723 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
2724 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2725 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2726 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2727 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
2728 *(a_pfEFlags) = fEflTmp; \
2729 } while (0)
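
/* Illustrative sketch (not built): NEG is a subtraction from zero, so CF is
   set for any non-zero operand. Made-up values; the helper name is
   hypothetical. */
#if 0 /* example only */
static void iemExampleNeg(void)
{
    uint8_t uVal = 1;
    uint32_t fEfl = 0;
    iemAImpl_neg_u8(&uVal, &fEfl);
    Assert(uVal == 0xff && (fEfl & X86_EFL_CF));
}
#endif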
2730
2731IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2732{
2733 uint64_t uDst = *puDst;
2734 uint64_t uResult = (uint64_t)0 - uDst;
2735 *puDst = uResult;
2736 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2737}
2738
2739# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2740
2741IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2742{
2743 uint32_t uDst = *puDst;
2744 uint32_t uResult = (uint32_t)0 - uDst;
2745 *puDst = uResult;
2746 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2747}
2748
2749
2750IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2751{
2752 uint16_t uDst = *puDst;
2753 uint16_t uResult = (uint16_t)0 - uDst;
2754 *puDst = uResult;
2755 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2756}
2757
2758
2759IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2760{
2761 uint8_t uDst = *puDst;
2762 uint8_t uResult = (uint8_t)0 - uDst;
2763 *puDst = uResult;
2764 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2765}
2766
2767# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2768
2769/*
2770 * Locked variants.
2771 */
2772
2773/** Emit a function for doing a locked unary operand operation. */
2774# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2775 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2776 uint32_t *pfEFlags)) \
2777 { \
2778 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2779 uint ## a_cBitsWidth ## _t uTmp; \
2780 uint32_t fEflTmp; \
2781 do \
2782 { \
2783 uTmp = uOld; \
2784 fEflTmp = *pfEFlags; \
2785 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2786 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2787 *pfEFlags = fEflTmp; \
2788 }
2789
2790EMIT_LOCKED_UNARY_OP(inc, 64)
2791EMIT_LOCKED_UNARY_OP(dec, 64)
2792EMIT_LOCKED_UNARY_OP(not, 64)
2793EMIT_LOCKED_UNARY_OP(neg, 64)
2794# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2795EMIT_LOCKED_UNARY_OP(inc, 32)
2796EMIT_LOCKED_UNARY_OP(dec, 32)
2797EMIT_LOCKED_UNARY_OP(not, 32)
2798EMIT_LOCKED_UNARY_OP(neg, 32)
2799
2800EMIT_LOCKED_UNARY_OP(inc, 16)
2801EMIT_LOCKED_UNARY_OP(dec, 16)
2802EMIT_LOCKED_UNARY_OP(not, 16)
2803EMIT_LOCKED_UNARY_OP(neg, 16)
2804
2805EMIT_LOCKED_UNARY_OP(inc, 8)
2806EMIT_LOCKED_UNARY_OP(dec, 8)
2807EMIT_LOCKED_UNARY_OP(not, 8)
2808EMIT_LOCKED_UNARY_OP(neg, 8)
2809# endif
2810
2811#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2812
2813
2814/*********************************************************************************************************************************
2815* Shifting and Rotating *
2816*********************************************************************************************************************************/
2817
2818/*
2819 * ROL
2820 */
2821#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2822IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2823{ \
2824 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2825 if (cShift) \
2826 { \
2827 if (a_cBitsWidth < 32) \
2828 cShift &= a_cBitsWidth - 1; \
2829 a_uType const uDst = *puDst; \
2830 a_uType const uResult = a_fnHlp(uDst, cShift); \
2831 *puDst = uResult; \
2832 \
2833 /* Calc EFLAGS. The OF bit is undefined if cShift > 1; we implement \
2834 it the same way as for 1-bit shifts. */ \
2835 AssertCompile(X86_EFL_CF_BIT == 0); \
2836 uint32_t fEfl = *pfEFlags; \
2837 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2838 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2839 fEfl |= fCarry; \
2840 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2841 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2842 else /* Intel 10980XE: According to the first sub-shift: */ \
2843 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2844 *pfEFlags = fEfl; \
2845 } \
2846}
2847
2848#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2849EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2850#endif
2851EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2852EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2853
2854#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2855EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2856#endif
2857EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2858EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2859
2860DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2861{
2862 return (uValue << cShift) | (uValue >> (16 - cShift));
2863}
2864#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2865EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2866#endif
2867EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2868EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2869
2870DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2871{
2872 return (uValue << cShift) | (uValue >> (8 - cShift));
2873}
2874#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2875EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2876#endif
2877EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2878EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
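
/* Illustrative sketch (not built): after ROL the CF bit mirrors the bit that
   was rotated into position 0. Made-up values; the helper name is
   hypothetical. */
#if 0 /* example only */
static void iemExampleRol(void)
{
    uint8_t uVal = 0x81;
    uint32_t fEfl = 0;
    iemAImpl_rol_u8_amd(&uVal, 1, &fEfl); /* 1000 0001 -> 0000 0011 */
    Assert(uVal == 0x03 && (fEfl & X86_EFL_CF));
}
#endif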
2879
2880
2881/*
2882 * ROR
2883 */
2884#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2885IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2886{ \
2887 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2888 if (cShift) \
2889 { \
2890 if (a_cBitsWidth < 32) \
2891 cShift &= a_cBitsWidth - 1; \
2892 a_uType const uDst = *puDst; \
2893 a_uType const uResult = a_fnHlp(uDst, cShift); \
2894 *puDst = uResult; \
2895 \
2896 /* Calc EFLAGS: */ \
2897 AssertCompile(X86_EFL_CF_BIT == 0); \
2898 uint32_t fEfl = *pfEFlags; \
2899 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2900 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2901 fEfl |= fCarry; \
2902 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2903 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2904 else /* Intel 10980XE: According to the first sub-shift: */ \
2905 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2906 *pfEFlags = fEfl; \
2907 } \
2908}
2909
2910#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2911EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2912#endif
2913EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2914EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2915
2916#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2917EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2918#endif
2919EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2920EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2921
2922DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2923{
2924 return (uValue >> cShift) | (uValue << (16 - cShift));
2925}
2926#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2927EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2928#endif
2929EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2930EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2931
2932DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2933{
2934 return (uValue >> cShift) | (uValue << (8 - cShift));
2935}
2936#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2937EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2938#endif
2939EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2940EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2941
2942
2943/*
2944 * RCL
2945 */
2946#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2947IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2948{ \
2949 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2950 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2951 cShift %= a_cBitsWidth + 1; \
2952 if (cShift) \
2953 { \
2954 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2955 cShift %= a_cBitsWidth + 1; \
2956 a_uType const uDst = *puDst; \
2957 a_uType uResult = uDst << cShift; \
2958 if (cShift > 1) \
2959 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2960 \
2961 AssertCompile(X86_EFL_CF_BIT == 0); \
2962 uint32_t fEfl = *pfEFlags; \
2963 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2964 uResult |= (a_uType)fInCarry << (cShift - 1); \
2965 \
2966 *puDst = uResult; \
2967 \
2968 /* Calc EFLAGS. */ \
2969 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2970 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2971 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2972 fEfl |= fOutCarry; \
2973 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2974 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
2975 else /* Intel 10980XE: According to the first sub-shift: */ \
2976 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2977 *pfEFlags = fEfl; \
2978 } \
2979}
2980
2981#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2982EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
2983#endif
2984EMIT_RCL(64, uint64_t, _intel, 1)
2985EMIT_RCL(64, uint64_t, _amd, 0)
2986
2987#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2988EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
2989#endif
2990EMIT_RCL(32, uint32_t, _intel, 1)
2991EMIT_RCL(32, uint32_t, _amd, 0)
2992
2993#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2994EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
2995#endif
2996EMIT_RCL(16, uint16_t, _intel, 1)
2997EMIT_RCL(16, uint16_t, _amd, 0)
2998
2999#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3000EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3001#endif
3002EMIT_RCL(8, uint8_t, _intel, 1)
3003EMIT_RCL(8, uint8_t, _amd, 0)
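
/* Illustrative sketch (not built): RCL rotates through CF, i.e. a 9-bit
   rotate for byte operands; bit 7 moves into CF and the old CF moves into
   bit 0. Made-up values; the helper name is hypothetical. */
#if 0 /* example only */
static void iemExampleRcl(void)
{
    uint8_t uVal = 0x80;
    uint32_t fEfl = 0; /* CF clear on entry */
    iemAImpl_rcl_u8_intel(&uVal, 1, &fEfl);
    Assert(uVal == 0x00 && (fEfl & X86_EFL_CF)); /* old bit 7 is now in CF */
}
#endif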
3004
3005
3006/*
3007 * RCR
3008 */
3009#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3010IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3011{ \
3012 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3013 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3014 cShift %= a_cBitsWidth + 1; \
3015 if (cShift) \
3016 { \
3017 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3018 cShift %= a_cBitsWidth + 1; \
3019 a_uType const uDst = *puDst; \
3020 a_uType uResult = uDst >> cShift; \
3021 if (cShift > 1) \
3022 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3023 \
3024 AssertCompile(X86_EFL_CF_BIT == 0); \
3025 uint32_t fEfl = *pfEFlags; \
3026 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3027 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3028 *puDst = uResult; \
3029 \
3030 /* Calc EFLAGS. The OF bit is undefined if cShift > 1; we implement \
3031 it the same way as for 1-bit shifts. */ \
3032 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3033 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3034 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3035 fEfl |= fOutCarry; \
3036 if (!a_fIntelFlags) /* AMD 3990X: XOR two most significant bits of the result: */ \
3037 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3038 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3039 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3040 *pfEFlags = fEfl; \
3041 } \
3042}
3043
3044#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3045EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3046#endif
3047EMIT_RCR(64, uint64_t, _intel, 1)
3048EMIT_RCR(64, uint64_t, _amd, 0)
3049
3050#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3051EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3052#endif
3053EMIT_RCR(32, uint32_t, _intel, 1)
3054EMIT_RCR(32, uint32_t, _amd, 0)
3055
3056#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3057EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3058#endif
3059EMIT_RCR(16, uint16_t, _intel, 1)
3060EMIT_RCR(16, uint16_t, _amd, 0)
3061
3062#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3063EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3064#endif
3065EMIT_RCR(8, uint8_t, _intel, 1)
3066EMIT_RCR(8, uint8_t, _amd, 0)
3067
3068
3069/*
3070 * SHL
3071 */
3072#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3073IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3074{ \
3075 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3076 if (cShift) \
3077 { \
3078 a_uType const uDst = *puDst; \
3079 a_uType uResult = uDst << cShift; \
3080 *puDst = uResult; \
3081 \
3082 /* Calc EFLAGS. */ \
3083 AssertCompile(X86_EFL_CF_BIT == 0); \
3084 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3085 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3086 fEfl |= fCarry; \
3087 if (!a_fIntelFlags) \
3088 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3089 else \
3090 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3091 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3092 fEfl |= X86_EFL_CALC_ZF(uResult); \
3093 fEfl |= g_afParity[uResult & 0xff]; \
3094 if (!a_fIntelFlags) \
3095 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
3096 *pfEFlags = fEfl; \
3097 } \
3098}
3099
3100#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3101EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3102#endif
3103EMIT_SHL(64, uint64_t, _intel, 1)
3104EMIT_SHL(64, uint64_t, _amd, 0)
3105
3106#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3107EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3108#endif
3109EMIT_SHL(32, uint32_t, _intel, 1)
3110EMIT_SHL(32, uint32_t, _amd, 0)
3111
3112#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3113EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3114#endif
3115EMIT_SHL(16, uint16_t, _intel, 1)
3116EMIT_SHL(16, uint16_t, _amd, 0)
3117
3118#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3119EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3120#endif
3121EMIT_SHL(8, uint8_t, _intel, 1)
3122EMIT_SHL(8, uint8_t, _amd, 0)
3123
3124
3125/*
3126 * SHR
3127 */
3128#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3129IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3130{ \
3131 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3132 if (cShift) \
3133 { \
3134 a_uType const uDst = *puDst; \
3135 a_uType uResult = uDst >> cShift; \
3136 *puDst = uResult; \
3137 \
3138 /* Calc EFLAGS. */ \
3139 AssertCompile(X86_EFL_CF_BIT == 0); \
3140 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3141 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3142 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what Intel documents; Intel 10980XE does this for all shift counts. */ \
3143 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3144 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3145 fEfl |= X86_EFL_CALC_ZF(uResult); \
3146 fEfl |= g_afParity[uResult & 0xff]; \
3147 if (!a_fIntelFlags) \
3148 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
3149 *pfEFlags = fEfl; \
3150 } \
3151}
3152
3153#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3154EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3155#endif
3156EMIT_SHR(64, uint64_t, _intel, 1)
3157EMIT_SHR(64, uint64_t, _amd, 0)
3158
3159#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3160EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3161#endif
3162EMIT_SHR(32, uint32_t, _intel, 1)
3163EMIT_SHR(32, uint32_t, _amd, 0)
3164
3165#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3166EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3167#endif
3168EMIT_SHR(16, uint16_t, _intel, 1)
3169EMIT_SHR(16, uint16_t, _amd, 0)
3170
3171#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3172EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3173#endif
3174EMIT_SHR(8, uint8_t, _intel, 1)
3175EMIT_SHR(8, uint8_t, _amd, 0)
3176
3177
3178/*
3179 * SAR
3180 */
3181#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3182IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3183{ \
3184 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3185 if (cShift) \
3186 { \
3187 a_iType const iDst = (a_iType)*puDst; \
3188 a_uType uResult = iDst >> cShift; \
3189 *puDst = uResult; \
3190 \
3191 /* Calc EFLAGS. \
3192 Note! The OF flag is always zero because the sign of the result never differs from the input. */ \
3193 AssertCompile(X86_EFL_CF_BIT == 0); \
3194 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3195 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3196 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3197 fEfl |= X86_EFL_CALC_ZF(uResult); \
3198 fEfl |= g_afParity[uResult & 0xff]; \
3199 if (!a_fIntelFlags) \
3200 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
3201 *pfEFlags = fEfl; \
3202 } \
3203}
3204
3205#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3206EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3207#endif
3208EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3209EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3210
3211#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3212EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3213#endif
3214EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3215EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3216
3217#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3218EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3219#endif
3220EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3221EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3222
3223#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3224EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3225#endif
3226EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3227EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3228
3229
3230/*
3231 * SHLD
3232 *
3233 * - CF is the last bit shifted out of puDst.
3234 * - AF is always cleared by Intel 10980XE.
3235 * - AF is always set by AMD 3990X.
3236 * - OF is set according to the first shift on Intel 10980XE, it seems.
3237 * - OF is set according to the last sub-shift on AMD 3990X.
3238 * - ZF, SF and PF are calculated according to the result by both vendors.
3239 *
3240 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3241 * pick either the source register or the destination register for input bits
3242 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3243 * Intel has changed behaviour here several times. We implement what current
3244 * Skylake-based CPUs do for now; we can extend this later as needed.
3245 */
3246#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3247IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3248 uint32_t *pfEFlags)) \
3249{ \
3250 cShift &= a_cBitsWidth - 1; \
3251 if (cShift) \
3252 { \
3253 a_uType const uDst = *puDst; \
3254 a_uType uResult = uDst << cShift; \
3255 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3256 *puDst = uResult; \
3257 \
3258 /* CALC EFLAGS: */ \
3259 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3260 if (a_fIntelFlags) \
3261 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3262 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3263 else \
3264 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3265 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3266 fEfl |= X86_EFL_AF; \
3267 } \
3268 AssertCompile(X86_EFL_CF_BIT == 0); \
3269 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3270 fEfl |= g_afParity[uResult & 0xff]; \
3271 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3272 fEfl |= X86_EFL_CALC_ZF(uResult); \
3273 *pfEFlags = fEfl; \
3274 } \
3275}
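/*
 * Illustration only (not part of IEM): the SHLD result is simply the
 * destination shifted left with the vacated low bits filled from the top of
 * the source, e.g. shld32(0x12345678, 0x9abcdef0, 8) = 0x3456789a; the macro
 * above additionally reproduces the vendor-specific EFLAGS behaviour.
 */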
3276
3277#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3278EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3279#endif
3280EMIT_SHLD(64, uint64_t, _intel, 1)
3281EMIT_SHLD(64, uint64_t, _amd, 0)
3282
3283#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3284EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3285#endif
3286EMIT_SHLD(32, uint32_t, _intel, 1)
3287EMIT_SHLD(32, uint32_t, _amd, 0)
3288
3289#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3290IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3291{ \
3292 cShift &= 31; \
3293 if (cShift) \
3294 { \
3295 uint16_t const uDst = *puDst; \
3296 uint64_t const uTmp = a_fIntelFlags \
3297 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3298 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3299 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3300 *puDst = uResult; \
3301 \
3302 /* CALC EFLAGS: */ \
3303 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3304 AssertCompile(X86_EFL_CF_BIT == 0); \
3305 if (a_fIntelFlags) \
3306 { \
3307 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3308 /* Intel 6700K & 10980XE: OF is set according to the first shift. AF always cleared. */ \
3309 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3310 } \
3311 else \
3312 { \
3313 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3314 if (cShift < 16) \
3315 { \
3316 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3317 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3318 } \
3319 else \
3320 { \
3321 if (cShift == 16) \
3322 fEfl |= uDst & X86_EFL_CF; \
3323 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3324 } \
3325 fEfl |= X86_EFL_AF; \
3326 } \
3327 fEfl |= g_afParity[uResult & 0xff]; \
3328 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3329 fEfl |= X86_EFL_CALC_ZF(uResult); \
3330 *pfEFlags = fEfl; \
3331 } \
3332}
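/*
 * Layout note (added for clarity): in the Intel variant above uTmp is
 * uDst:uSrc:uDst (bits 47:32, 31:16 and 15:0), so a left shift of up to 31
 * first pulls uSrc bits and then uDst bits into the 16-bit result window at
 * bits 47:32; the AMD variant uses uSrc for the low 16 bits instead.
 */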
3333
3334#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3335EMIT_SHLD_16(RT_NOTHING, 1)
3336#endif
3337EMIT_SHLD_16(_intel, 1)
3338EMIT_SHLD_16(_amd, 0)
3339
3340
3341/*
3342 * SHRD
3343 *
3344 * EFLAGS behaviour seems to be the same as with SHLD:
3345 * - CF is the last bit shifted out of puDst.
3346 * - AF is always cleared by Intel 10980XE.
3347 * - AF is always set by AMD 3990X.
3348 * - OF is set according to the first shift on Intel 10980XE, it seems.
3349 * - OF is set according to the last sub-shift on AMD 3990X.
3350 * - ZF, SF and PF are calculated according to the result by both vendors.
3351 *
3352 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3353 * pick either the source register or the destination register for input bits
3354 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3355 * Intel has changed behaviour here several times. We implement what current
3356 * Skylake-based CPUs do for now; we can extend this later as needed.
3357 */
3358#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3359IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3360{ \
3361 cShift &= a_cBitsWidth - 1; \
3362 if (cShift) \
3363 { \
3364 a_uType const uDst = *puDst; \
3365 a_uType uResult = uDst >> cShift; \
3366 uResult |= uSrc << (a_cBitsWidth - cShift); \
3367 *puDst = uResult; \
3368 \
3369 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3370 AssertCompile(X86_EFL_CF_BIT == 0); \
3371 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3372 if (a_fIntelFlags) \
3373 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3374 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3375 else \
3376 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3377 if (cShift > 1) /* Set according to last shift. */ \
3378 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3379 else \
3380 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3381 fEfl |= X86_EFL_AF; \
3382 } \
3383 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3384 fEfl |= X86_EFL_CALC_ZF(uResult); \
3385 fEfl |= g_afParity[uResult & 0xff]; \
3386 *pfEFlags = fEfl; \
3387 } \
3388}
3389
3390#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3391EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3392#endif
3393EMIT_SHRD(64, uint64_t, _intel, 1)
3394EMIT_SHRD(64, uint64_t, _amd, 0)
3395
3396#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3397EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3398#endif
3399EMIT_SHRD(32, uint32_t, _intel, 1)
3400EMIT_SHRD(32, uint32_t, _amd, 0)
3401
3402#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3403IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3404{ \
3405 cShift &= 31; \
3406 if (cShift) \
3407 { \
3408 uint16_t const uDst = *puDst; \
3409 uint64_t const uTmp = a_fIntelFlags \
3410 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3411 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3412 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3413 *puDst = uResult; \
3414 \
3415 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3416 AssertCompile(X86_EFL_CF_BIT == 0); \
3417 if (a_fIntelFlags) \
3418 { \
3419 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3420 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3421 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3422 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3423 } \
3424 else \
3425 { \
3426 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3427 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3428 /* AMD 3990X: Set according to last shift. AF always set. */ \
3429 if (cShift > 1) /* Set according to last shift. */ \
3430 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3431 else \
3432 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3433 fEfl |= X86_EFL_AF; \
3434 } \
3435 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3436 fEfl |= X86_EFL_CALC_ZF(uResult); \
3437 fEfl |= g_afParity[uResult & 0xff]; \
3438 *pfEFlags = fEfl; \
3439 } \
3440}
3441
3442#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3443EMIT_SHRD_16(RT_NOTHING, 1)
3444#endif
3445EMIT_SHRD_16(_intel, 1)
3446EMIT_SHRD_16(_amd, 0)
3447
3448
3449/*
3450 * RORX (BMI2)
3451 */
3452#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3453IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3454{ \
3455 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3456}
3457
3458#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3459EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3460#endif
3461#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3462EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3463#endif
3464
3465
3466/*
3467 * SHLX (BMI2)
3468 */
3469#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3470IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3471{ \
3472 cShift &= a_cBitsWidth - 1; \
3473 *puDst = uSrc << cShift; \
3474}
3475
3476#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3477EMIT_SHLX(64, uint64_t, RT_NOTHING)
3478EMIT_SHLX(64, uint64_t, _fallback)
3479#endif
3480#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3481EMIT_SHLX(32, uint32_t, RT_NOTHING)
3482EMIT_SHLX(32, uint32_t, _fallback)
3483#endif
3484
3485
3486/*
3487 * SHRX (BMI2)
3488 */
3489#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3490IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3491{ \
3492 cShift &= a_cBitsWidth - 1; \
3493 *puDst = uSrc >> cShift; \
3494}
3495
3496#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3497EMIT_SHRX(64, uint64_t, RT_NOTHING)
3498EMIT_SHRX(64, uint64_t, _fallback)
3499#endif
3500#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3501EMIT_SHRX(32, uint32_t, RT_NOTHING)
3502EMIT_SHRX(32, uint32_t, _fallback)
3503#endif
3504
3505
3506/*
3507 * SARX (BMI2)
3508 */
3509#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3510IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3511{ \
3512 cShift &= a_cBitsWidth - 1; \
3513 *puDst = (a_iType)uSrc >> cShift; \
3514}
3515
3516#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3517EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3518EMIT_SARX(64, uint64_t, int64_t, _fallback)
3519#endif
3520#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3521EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3522EMIT_SARX(32, uint32_t, int32_t, _fallback)
3523#endif
3524
3525
3526/*
3527 * PDEP (BMI2)
3528 */
3529#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3530IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3531{ \
3532 a_uType uResult = 0; \
3533 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3534 if (fMask & ((a_uType)1 << iMaskBit)) \
3535 { \
3536 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3537 iBit++; \
3538 } \
3539 *puDst = uResult; \
3540}
3541
3542#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3543EMIT_PDEP(64, uint64_t, RT_NOTHING)
3544#endif
3545EMIT_PDEP(64, uint64_t, _fallback)
3546#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3547EMIT_PDEP(32, uint32_t, RT_NOTHING)
3548#endif
3549EMIT_PDEP(32, uint32_t, _fallback)
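/*
 * Worked PDEP example (illustration only): with fMask=0xf0f0 the mask
 * selects bits 4-7 and 12-15, and the low-order source bits are deposited
 * there in ascending order.
 */
#if 0
uint32_t uResult;
iemAImpl_pdep_u32_fallback(&uResult, 0x05 /*uSrc*/, 0xf0f0 /*fMask*/);
Assert(uResult == 0x0050); /* src bits 0 and 2 land in mask bits 4 and 6 */
#endif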
3550
3551/*
3552 * PEXT (BMI2)
3553 */
3554#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PEXT(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PEXT(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PEXT(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PEXT(32, uint32_t, _fallback)
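/*
 * Worked PEXT example (illustration only): PEXT is the inverse gather of
 * PDEP, pulling the mask-selected bits down into the low-order result bits.
 */
#if 0
uint32_t uResult;
iemAImpl_pext_u32_fallback(&uResult, 0x0050 /*uSrc*/, 0xf0f0 /*fMask*/);
Assert(uResult == 0x05); /* bits 4 and 6 of uSrc -> result bits 0 and 2 */
#endif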
3575
3576
3577#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3578
3579# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3580/*
3581 * BSWAP
3582 */
3583
3584IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3585{
3586 *puDst = ASMByteSwapU64(*puDst);
3587}
3588
3589
3590IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3591{
3592 *puDst = ASMByteSwapU32(*puDst);
3593}
3594
3595
3596/* Note! Undocumented, hence the 32-bit argument. */
3597IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3598{
3599#if 0
3600 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3601#else
3602 /* This is the behaviour of the AMD 3990X (64-bit mode): */
3603 *(uint16_t *)puDst = 0;
3604#endif
3605}
3606
3607# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3608
3609
3610
3611# if defined(IEM_WITHOUT_ASSEMBLY)
3612
3613/*
3614 * LFENCE, SFENCE & MFENCE.
3615 */
3616
3617IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3618{
3619 ASMReadFence();
3620}
3621
3622
3623IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3624{
3625 ASMWriteFence();
3626}
3627
3628
3629IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3630{
3631 ASMMemoryFence();
3632}
3633
3634
3635# ifndef RT_ARCH_ARM64
3636IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3637{
3638 ASMMemoryFence();
3639}
3640# endif
3641
3642# endif
3643
3644#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3645
3646
3647IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3648{
3649 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3650 {
3651 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3652 *pu16Dst |= u16Src & X86_SEL_RPL;
3653
3654 *pfEFlags |= X86_EFL_ZF;
3655 }
3656 else
3657 *pfEFlags &= ~X86_EFL_ZF;
3658}
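/*
 * Worked ARPL example (illustration only): a destination selector of 0x0008
 * (RPL=0) adjusted against a source selector with RPL=3 gets its RPL raised
 * and ZF set; a destination RPL >= the source RPL only clears ZF.
 */
#if 0
uint16_t u16Sel = 0x0008;
uint32_t fEFlags = 0;
iemAImpl_arpl(&u16Sel, 0x0003, &fEFlags);
Assert(u16Sel == 0x000b && (fEFlags & X86_EFL_ZF));
#endif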
3659
3660
3661#if defined(IEM_WITHOUT_ASSEMBLY)
3662
3663/*********************************************************************************************************************************
3664* x87 FPU Loads *
3665*********************************************************************************************************************************/
3666
3667IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3668{
3669 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3670 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3671 {
3672 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3673 pFpuRes->r80Result.sj64.fInteger = 1;
3674 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3675 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3676 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3677 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3678 }
3679 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3680 {
3681 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3682 pFpuRes->r80Result.s.uExponent = 0;
3683 pFpuRes->r80Result.s.uMantissa = 0;
3684 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3685 }
3686 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3687 {
3688 /* Subnormal values get normalized. */
3689 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3690 pFpuRes->r80Result.sj64.fInteger = 1;
3691 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3692 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3693 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3694 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3695 pFpuRes->FSW |= X86_FSW_DE;
3696 if (!(pFpuState->FCW & X86_FCW_DM))
3697 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3698 }
3699 else if (RTFLOAT32U_IS_INF(pr32Val))
3700 {
3701 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3702 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3703 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3704 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3705 }
3706 else
3707 {
3708 /* Signalling and quiet NaNs both turn into quiet ones when loaded (weird). */
3709 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3710 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3711 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3712 pFpuRes->r80Result.sj64.fInteger = 1;
3713 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3714 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3715 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3716 {
3717 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3718 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3719 pFpuRes->FSW |= X86_FSW_IE;
3720
3721 if (!(pFpuState->FCW & X86_FCW_IM))
3722 {
3723 /* The value is not pushed. */
3724 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3725 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3726 pFpuRes->r80Result.au64[0] = 0;
3727 pFpuRes->r80Result.au16[4] = 0;
3728 }
3729 }
3730 else
3731 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3732 }
3733}
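/*
 * Worked example for the normal path above (illustration only): 1.0f has
 * sign=0, biased exponent 127 and a zero fraction, so it loads as the R80
 * value with uExponent = 127 - 127 + 16383 = 0x3fff, fInteger=1 and
 * uFraction=0.
 */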
3734
3735
3736IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3737{
3738 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3739 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3740 {
3741 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3742 pFpuRes->r80Result.sj64.fInteger = 1;
3743 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3744 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3745 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3746 }
3747 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3748 {
3749 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3750 pFpuRes->r80Result.s.uExponent = 0;
3751 pFpuRes->r80Result.s.uMantissa = 0;
3752 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3753 }
3754 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3755 {
3756 /* Subnormal values get normalized. */
3757 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3758 pFpuRes->r80Result.sj64.fInteger = 1;
3759 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3760 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3761 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3762 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3763 pFpuRes->FSW |= X86_FSW_DE;
3764 if (!(pFpuState->FCW & X86_FCW_DM))
3765 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3766 }
3767 else if (RTFLOAT64U_IS_INF(pr64Val))
3768 {
3769 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3770 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3771 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3772 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3773 }
3774 else
3775 {
3776 /* Signalling and quiet NaNs both turn into quiet ones when loaded (weird). */
3777 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3778 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3779 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3780 pFpuRes->r80Result.sj64.fInteger = 1;
3781 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3782 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3783 {
3784 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3785 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3786 pFpuRes->FSW |= X86_FSW_IE;
3787
3788 if (!(pFpuState->FCW & X86_FCW_IM))
3789 {
3790 /* The value is not pushed. */
3791 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3792 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3793 pFpuRes->r80Result.au64[0] = 0;
3794 pFpuRes->r80Result.au16[4] = 0;
3795 }
3796 }
3797 else
3798 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3799 }
3800}
3801
3802
3803IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3804{
3805 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3806 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3807 /* Raises no exceptions. */
3808 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3809}
3810
3811
3812IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3813{
3814 pFpuRes->r80Result.sj64.fSign = 0;
3815 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3816 pFpuRes->r80Result.sj64.fInteger = 1;
3817 pFpuRes->r80Result.sj64.uFraction = 0;
3818
3819 /*
3820 * FPU status word:
3821 * - TOP is irrelevant, but we must match x86 assembly version.
3822 * - C1 is always cleared as we don't have any stack overflows.
3823 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3824 */
3825 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3826}
3827
3828
3829IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3830{
3831 pFpuRes->r80Result.sj64.fSign = 0;
3832 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3833 pFpuRes->r80Result.sj64.fInteger = 1;
3834 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3835 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3836 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3837 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3838}
3839
3840
3841IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3842{
3843 pFpuRes->r80Result.sj64.fSign = 0;
3844 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3845 pFpuRes->r80Result.sj64.fInteger = 1;
3846 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3847 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3848 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3849}
3850
3851
3852IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3853{
3854 pFpuRes->r80Result.sj64.fSign = 0;
3855 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3856 pFpuRes->r80Result.sj64.fInteger = 1;
3857 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3858 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3859 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3860 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3861}
3862
3863
3864IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3865{
3866 pFpuRes->r80Result.sj64.fSign = 0;
3867 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3868 pFpuRes->r80Result.sj64.fInteger = 1;
3869 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3870 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3871 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3872 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3873}
3874
3875
3876IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3877{
3878 pFpuRes->r80Result.sj64.fSign = 0;
3879 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3880 pFpuRes->r80Result.sj64.fInteger = 1;
3881 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3882 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3883 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3884 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3885}
3886
3887
3888IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3889{
3890 pFpuRes->r80Result.s.fSign = 0;
3891 pFpuRes->r80Result.s.uExponent = 0;
3892 pFpuRes->r80Result.s.uMantissa = 0;
3893 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3894}
3895
3896#define EMIT_FILD(a_cBits) \
3897IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3898 int ## a_cBits ## _t const *piVal)) \
3899{ \
3900 int ## a_cBits ## _t iVal = *piVal; \
3901 if (iVal == 0) \
3902 { \
3903 pFpuRes->r80Result.s.fSign = 0; \
3904 pFpuRes->r80Result.s.uExponent = 0; \
3905 pFpuRes->r80Result.s.uMantissa = 0; \
3906 } \
3907 else \
3908 { \
3909 if (iVal > 0) \
3910 pFpuRes->r80Result.s.fSign = 0; \
3911 else \
3912 { \
3913 pFpuRes->r80Result.s.fSign = 1; \
3914 iVal = -iVal; \
3915 } \
3916 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3917 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3918 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3919 } \
3920 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3921}
3922EMIT_FILD(16)
3923EMIT_FILD(32)
3924EMIT_FILD(64)
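/*
 * Worked FILD example (illustration only): loading the 16-bit integer 100
 * (0b1100100, so ASMBitLastSetU16 returns 7) gives uExponent = 7 - 1 + 16383
 * = 0x4005 and uMantissa = 100 << 57 = 0xc800000000000000, i.e. 1.5625 * 2^6.
 */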
3925
3926
3927IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3928{
3929 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3930 if ( pd80Val->s.abPairs[0] == 0
3931 && pd80Val->s.abPairs[1] == 0
3932 && pd80Val->s.abPairs[2] == 0
3933 && pd80Val->s.abPairs[3] == 0
3934 && pd80Val->s.abPairs[4] == 0
3935 && pd80Val->s.abPairs[5] == 0
3936 && pd80Val->s.abPairs[6] == 0
3937 && pd80Val->s.abPairs[7] == 0
3938 && pd80Val->s.abPairs[8] == 0)
3939 {
3940 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3941 pFpuRes->r80Result.s.uExponent = 0;
3942 pFpuRes->r80Result.s.uMantissa = 0;
3943 }
3944 else
3945 {
3946 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3947
3948 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3949 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3950 cPairs--;
3951
3952 uint64_t uVal = 0;
3953 uint64_t uFactor = 1;
3954 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3955 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3956 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3957
3958 unsigned const cBits = ASMBitLastSetU64(uVal);
3959 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3960 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3961 }
3962}
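/*
 * Worked BCD example (illustration only): the packed decimal 1234 arrives as
 * abPairs[0]=0x34, abPairs[1]=0x12 (two digits per byte, little endian), so
 * the loop above accumulates 4*1 + 3*10 + 2*100 + 1*1000 = 1234 before the
 * usual binary normalization into exponent and mantissa.
 */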
3963
3964
3965/*********************************************************************************************************************************
3966* x87 FPU Stores *
3967*********************************************************************************************************************************/
3968
3969/**
3970 * Helper for storing a deconstructed and normal R80 value as a 32-bit one.
3971 *
3972 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
3973 *
3974 * @returns Updated FPU status word value.
3975 * @param fSignIn Incoming sign indicator.
3976 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
3977 * @param iExponentIn Unbiased exponent.
3978 * @param fFcw The FPU control word.
3979 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
3980 * @param pr32Dst Where to return the output value, if one should be
3981 * returned.
3982 *
3983 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
3984 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
3985 */
3986static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
3987 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
3988{
3989 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0xffffffffff */
3990 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3991 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x8000000000 */
3992 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
3993 ? fRoundingOffMask
3994 : 0;
3995 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
3996
3997 /*
3998 * Deal with potential overflows/underflows first, optimizing for none.
3999 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4000 */
4001 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4002 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4003 { /* likely? */ }
4004 /*
4005 * Underflow if the exponent is zero or negative. We attempt to map this
4006 * to a subnormal number when possible, with some additional trickery of course.
4007 */
4008 else if (iExponentOut <= 0)
4009 {
4010 bool const fIsTiny = iExponentOut < 0
4011 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4012 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4013 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4014 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4015
4016 if (iExponentOut <= 0)
4017 {
4018 uMantissaIn = iExponentOut <= -63
4019 ? uMantissaIn != 0
4020 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4021 fRoundedOff = uMantissaIn & fRoundingOffMask;
4022 if (fRoundedOff && fIsTiny)
4023 fFsw |= X86_FSW_UE;
4024 iExponentOut = 0;
4025 }
4026 }
4027 /*
4028 * Overflow if at or above max exponent value or if we will reach max
4029 * when rounding. Will return +/-zero or +/-max value depending on
4030 * whether we're rounding or not.
4031 */
4032 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4033 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4034 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4035 {
4036 fFsw |= X86_FSW_OE;
4037 if (!(fFcw & X86_FCW_OM))
4038 return fFsw | X86_FSW_ES | X86_FSW_B;
4039 fFsw |= X86_FSW_PE;
4040 if (uRoundingAdd)
4041 fFsw |= X86_FSW_C1;
4042 if (!(fFcw & X86_FCW_PM))
4043 fFsw |= X86_FSW_ES | X86_FSW_B;
4044
4045 pr32Dst->s.fSign = fSignIn;
4046 if (uRoundingAdd)
4047 { /* Zero */
4048 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4049 pr32Dst->s.uFraction = 0;
4050 }
4051 else
4052 { /* Max */
4053 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4054 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4055 }
4056 return fFsw;
4057 }
4058
4059 /*
4060 * Normal or subnormal number.
4061 */
4062 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4063 uint64_t uMantissaOut = uMantissaIn;
4064 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4065 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4066 || fRoundedOff != uRoundingAdd)
4067 {
4068 uMantissaOut = uMantissaIn + uRoundingAdd;
4069 if (uMantissaOut >= uMantissaIn)
4070 { /* likely */ }
4071 else
4072 {
4073 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4074 iExponentOut++;
4075 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4076 fFsw |= X86_FSW_C1;
4077 }
4078 }
4079 else
4080 uMantissaOut = uMantissaIn;
4081
4082 /* Truncate the mantissa and set the return value. */
4083 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4084
4085 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4086 pr32Dst->s.uExponent = iExponentOut;
4087 pr32Dst->s.fSign = fSignIn;
4088
4089 /* Set status flags related to rounding. */
4090 if (fRoundedOff)
4091 {
4092 fFsw |= X86_FSW_PE;
4093 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4094 fFsw |= X86_FSW_C1;
4095 if (!(fFcw & X86_FCW_PM))
4096 fFsw |= X86_FSW_ES | X86_FSW_B;
4097 }
4098
4099 return fFsw;
4100}
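/*
 * Worked rounding example for the helper above (illustration only): storing
 * 1 + 2^-24 + 2^-41 as R32 drops the low 40 mantissa bits (0x8000400000);
 * under round-to-nearest that exceeds half an ULP (0x8000000000), so the
 * 23-bit fraction goes from 0 to 1 and the helper sets PE and C1 (rounded up).
 */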
4101
4102
4103/**
4104 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4105 */
4106IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4107 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4108{
4109 uint16_t const fFcw = pFpuState->FCW;
4110 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4111 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4112 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4113 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4114 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4115 {
4116 pr32Dst->s.fSign = pr80Src->s.fSign;
4117 pr32Dst->s.uExponent = 0;
4118 pr32Dst->s.uFraction = 0;
4119 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4120 }
4121 else if (RTFLOAT80U_IS_INF(pr80Src))
4122 {
4123 pr32Dst->s.fSign = pr80Src->s.fSign;
4124 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4125 pr32Dst->s.uFraction = 0;
4126 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4127 }
4128 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4129 {
4130 /* Mapped to +/-QNaN */
4131 pr32Dst->s.fSign = pr80Src->s.fSign;
4132 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4133 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4134 }
4135 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4136 {
4137 /* Pseudo-Inf / Pseudo-NaN / Unnormal -> QNaN (during load, probably) */
4138 if (fFcw & X86_FCW_IM)
4139 {
4140 pr32Dst->s.fSign = 1;
4141 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4142 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4143 fFsw |= X86_FSW_IE;
4144 }
4145 else
4146 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4147 }
4148 else if (RTFLOAT80U_IS_NAN(pr80Src))
4149 {
4150 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4151 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4152 {
4153 pr32Dst->s.fSign = pr80Src->s.fSign;
4154 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4155 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4156 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4157 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4158 fFsw |= X86_FSW_IE;
4159 }
4160 else
4161 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4162 }
4163 else
4164 {
4165 /* Denormal values cause both an underflow and a precision exception. */
4166 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4167 if (fFcw & X86_FCW_UM)
4168 {
4169 pr32Dst->s.fSign = pr80Src->s.fSign;
4170 pr32Dst->s.uExponent = 0;
4171 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4172 {
4173 pr32Dst->s.uFraction = 1;
4174 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4175 if (!(fFcw & X86_FCW_PM))
4176 fFsw |= X86_FSW_ES | X86_FSW_B;
4177 }
4178 else
4179 {
4180 pr32Dst->s.uFraction = 0;
4181 fFsw |= X86_FSW_UE | X86_FSW_PE;
4182 if (!(fFcw & X86_FCW_PM))
4183 fFsw |= X86_FSW_ES | X86_FSW_B;
4184 }
4185 }
4186 else
4187 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4188 }
4189 *pu16FSW = fFsw;
4190}
4191
4192
4193/**
4194 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4195 *
4196 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4197 *
4198 * @returns Updated FPU status word value.
4199 * @param fSignIn Incoming sign indicator.
4200 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4201 * @param iExponentIn Unbiased exponent.
4202 * @param fFcw The FPU control word.
4203 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4204 * @param pr64Dst Where to return the output value, if one should be
4205 * returned.
4206 *
4207 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4208 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4209 */
4210static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4211 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4212{
4213 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4214 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4215 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4216 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4217 ? fRoundingOffMask
4218 : 0;
4219 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4220
4221 /*
4222 * Deal with potential overflows/underflows first, optimizing for none.
4223 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4224 */
4225 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4226 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4227 { /* likely? */ }
4228 /*
4229 * Underflow if the exponent is zero or negative. We attempt to map this
4230 * to a subnormal number when possible, with some additional trickery of course.
4231 */
4232 else if (iExponentOut <= 0)
4233 {
4234 bool const fIsTiny = iExponentOut < 0
4235 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4236 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4237 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4238 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4239
4240 if (iExponentOut <= 0)
4241 {
4242 uMantissaIn = iExponentOut <= -63
4243 ? uMantissaIn != 0
4244 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4245 fRoundedOff = uMantissaIn & fRoundingOffMask;
4246 if (fRoundedOff && fIsTiny)
4247 fFsw |= X86_FSW_UE;
4248 iExponentOut = 0;
4249 }
4250 }
4251 /*
4252 * Overflow if at or above max exponent value or if we will reach max
4253 * when rounding. Will return +/-zero or +/-max value depending on
4254 * whether we're rounding or not.
4255 */
4256 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4257 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4258 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4259 {
4260 fFsw |= X86_FSW_OE;
4261 if (!(fFcw & X86_FCW_OM))
4262 return fFsw | X86_FSW_ES | X86_FSW_B;
4263 fFsw |= X86_FSW_PE;
4264 if (uRoundingAdd)
4265 fFsw |= X86_FSW_C1;
4266 if (!(fFcw & X86_FCW_PM))
4267 fFsw |= X86_FSW_ES | X86_FSW_B;
4268
4269 pr64Dst->s64.fSign = fSignIn;
4270 if (uRoundingAdd)
4271 { /* Zero */
4272 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4273 pr64Dst->s64.uFraction = 0;
4274 }
4275 else
4276 { /* Max */
4277 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4278 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4279 }
4280 return fFsw;
4281 }
4282
4283 /*
4284 * Normal or subnormal number.
4285 */
4286 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4287 uint64_t uMantissaOut = uMantissaIn;
4288 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4289 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4290 || fRoundedOff != uRoundingAdd)
4291 {
4292 uMantissaOut = uMantissaIn + uRoundingAdd;
4293 if (uMantissaOut >= uMantissaIn)
4294 { /* likely */ }
4295 else
4296 {
4297 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4298 iExponentOut++;
4299 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4300 fFsw |= X86_FSW_C1;
4301 }
4302 }
4303 else
4304 uMantissaOut = uMantissaIn;
4305
4306 /* Truncate the mantissa and set the return value. */
4307 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4308
4309 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4310 pr64Dst->s64.uExponent = iExponentOut;
4311 pr64Dst->s64.fSign = fSignIn;
4312
4313 /* Set status flags related to rounding. */
4314 if (fRoundedOff)
4315 {
4316 fFsw |= X86_FSW_PE;
4317 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4318 fFsw |= X86_FSW_C1;
4319 if (!(fFcw & X86_FCW_PM))
4320 fFsw |= X86_FSW_ES | X86_FSW_B;
4321 }
4322
4323 return fFsw;
4324}
4325
4326
4327/**
4328 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4329 */
4330IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4331 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4332{
4333 uint16_t const fFcw = pFpuState->FCW;
4334 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4335 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4336 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4337 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4338 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4339 {
4340 pr64Dst->s64.fSign = pr80Src->s.fSign;
4341 pr64Dst->s64.uExponent = 0;
4342 pr64Dst->s64.uFraction = 0;
4343 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4344 }
4345 else if (RTFLOAT80U_IS_INF(pr80Src))
4346 {
4347 pr64Dst->s64.fSign = pr80Src->s.fSign;
4348 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4349 pr64Dst->s64.uFraction = 0;
4350 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4351 }
4352 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4353 {
4354 /* Mapped to +/-QNaN */
4355 pr64Dst->s64.fSign = pr80Src->s.fSign;
4356 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4357 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4358 }
4359 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4360 {
4361 /* Pseudo-Inf / Pseudo-NaN / Unnormal -> QNaN (during load, probably) */
4362 if (fFcw & X86_FCW_IM)
4363 {
4364 pr64Dst->s64.fSign = 1;
4365 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4366 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4367 fFsw |= X86_FSW_IE;
4368 }
4369 else
4370 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4371 }
4372 else if (RTFLOAT80U_IS_NAN(pr80Src))
4373 {
4374 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4375 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4376 {
4377 pr64Dst->s64.fSign = pr80Src->s.fSign;
4378 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4379 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4380 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4381 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4382 fFsw |= X86_FSW_IE;
4383 }
4384 else
4385 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4386 }
4387 else
4388 {
4389 /* Denormal values cause both an underflow and a precision exception. */
4390 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4391 if (fFcw & X86_FCW_UM)
4392 {
4393 pr64Dst->s64.fSign = pr80Src->s.fSign;
4394 pr64Dst->s64.uExponent = 0;
4395 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4396 {
4397 pr64Dst->s64.uFraction = 1;
4398 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4399 if (!(fFcw & X86_FCW_PM))
4400 fFsw |= X86_FSW_ES | X86_FSW_B;
4401 }
4402 else
4403 {
4404 pr64Dst->s64.uFraction = 0;
4405 fFsw |= X86_FSW_UE | X86_FSW_PE;
4406 if (!(fFcw & X86_FCW_PM))
4407 fFsw |= X86_FSW_ES | X86_FSW_B;
4408 }
4409 }
4410 else
4411 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4412 }
4413 *pu16FSW = fFsw;
4414}
4415
4416
4417IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4418 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4419{
4420 /*
4421 * FPU status word:
4422 * - TOP is irrelevant, but we must match x86 assembly version (0).
4423 * - C1 is always cleared as we don't have any stack overflows.
4424 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4425 */
4426 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4427 *pr80Dst = *pr80Src;
4428}
4429
4430
4431/*
4432 *
4433 * Mantissa:
4434 * 63 56 48 40 32 24 16 8 0
4435 * v v v v v v v v v
4436 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4437 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4438 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4439 *
4440 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4441 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4442 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4443 * where we'll drop off all but bit 63.
4444 */
4445#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4446IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4447 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4448{ \
4449 uint16_t const fFcw = pFpuState->FCW; \
4450 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4451 bool const fSignIn = pr80Val->s.fSign; \
4452 \
4453 /* \
4454 * Deal with normal numbers first. \
4455 */ \
4456 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4457 { \
4458 uint64_t uMantissa = pr80Val->s.uMantissa; \
4459 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4460 \
4461 if ((uint32_t)iExponent <= a_cBits - 2) \
4462 { \
4463 unsigned const cShiftOff = 63 - iExponent; \
4464 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4465 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4466 ? RT_BIT_64(cShiftOff - 1) \
4467 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4468 ? fRoundingOffMask \
4469 : 0; \
4470 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4471 \
4472 uMantissa >>= cShiftOff; \
4473 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4474 uMantissa += uRounding; \
4475 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4476 { \
4477 if (fRoundedOff) \
4478 { \
4479 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4480 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4481 else if (uRounding) \
4482 fFsw |= X86_FSW_C1; \
4483 fFsw |= X86_FSW_PE; \
4484 if (!(fFcw & X86_FCW_PM)) \
4485 fFsw |= X86_FSW_ES | X86_FSW_B; \
4486 } \
4487 \
4488 if (!fSignIn) \
4489 *piDst = (a_iType)uMantissa; \
4490 else \
4491 *piDst = -(a_iType)uMantissa; \
4492 } \
4493 else \
4494 { \
4495 /* overflowed after rounding. */ \
4496 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4497 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4498 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4499 \
4500 /* Special case for the integer minimum value. */ \
4501 if (fSignIn) \
4502 { \
4503 *piDst = a_iTypeMin; \
4504 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4505 if (!(fFcw & X86_FCW_PM)) \
4506 fFsw |= X86_FSW_ES | X86_FSW_B; \
4507 } \
4508 else \
4509 { \
4510 fFsw |= X86_FSW_IE; \
4511 if (fFcw & X86_FCW_IM) \
4512 *piDst = a_iTypeMin; \
4513 else \
4514 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4515 } \
4516 } \
4517 } \
4518 /* \
4519 * Tiny sub-zero numbers. \
4520 */ \
4521 else if (iExponent < 0) \
4522 { \
4523 if (!fSignIn) \
4524 { \
4525 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4526 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4527 { \
4528 *piDst = 1; \
4529 fFsw |= X86_FSW_C1; \
4530 } \
4531 else \
4532 *piDst = 0; \
4533 } \
4534 else \
4535 { \
4536 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4537 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4538 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4539 *piDst = 0; \
4540 else \
4541 { \
4542 *piDst = -1; \
4543 fFsw |= X86_FSW_C1; \
4544 } \
4545 } \
4546 fFsw |= X86_FSW_PE; \
4547 if (!(fFcw & X86_FCW_PM)) \
4548 fFsw |= X86_FSW_ES | X86_FSW_B; \
4549 } \
4550 /* \
4551 * Special MIN case. \
4552 */ \
4553 else if ( fSignIn && iExponent == a_cBits - 1 \
4554 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4555 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4556 : uMantissa == RT_BIT_64(63))) \
4557 { \
4558 *piDst = a_iTypeMin; \
4559 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4560 { \
4561 fFsw |= X86_FSW_PE; \
4562 if (!(fFcw & X86_FCW_PM)) \
4563 fFsw |= X86_FSW_ES | X86_FSW_B; \
4564 } \
4565 } \
4566 /* \
4567 * Too large/small number outside the target integer range. \
4568 */ \
4569 else \
4570 { \
4571 fFsw |= X86_FSW_IE; \
4572 if (fFcw & X86_FCW_IM) \
4573 *piDst = a_iTypeIndefinite; \
4574 else \
4575 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4576 } \
4577 } \
4578 /* \
4579 * Map both +0 and -0 to integer zero (signless/+). \
4580 */ \
4581 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4582 *piDst = 0; \
4583 /* \
4584 * Denormals are just really tiny sub-zero numbers that are either rounded \
4585 * to zero, 1 or -1 depending on sign and rounding control. \
4586 */ \
4587 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4588 { \
4589 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4590 *piDst = 0; \
4591 else \
4592 { \
4593 *piDst = fSignIn ? -1 : 1; \
4594 fFsw |= X86_FSW_C1; \
4595 } \
4596 fFsw |= X86_FSW_PE; \
4597 if (!(fFcw & X86_FCW_PM)) \
4598 fFsw |= X86_FSW_ES | X86_FSW_B; \
4599 } \
4600 /* \
4601 * All other special values are considered invalid arguments and result \
4602 * in an IE exception and indefinite value if masked. \
4603 */ \
4604 else \
4605 { \
4606 fFsw |= X86_FSW_IE; \
4607 if (fFcw & X86_FCW_IM) \
4608 *piDst = a_iTypeIndefinite; \
4609 else \
4610 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4611 } \
4612 *pu16FSW = fFsw; \
4613}
4614EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4615EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4616EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
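/*
 * Worked FIST example (illustration only): with round-to-nearest, 2.5
 * (uMantissa=0xa000000000000000, iExponent=1) is exactly midway, so the
 * round-to-even step above masks the result back from 3 to 2, while 3.5
 * rounds up to 4 and sets C1; PE is set in both cases as bits were dropped.
 */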
4617
4618#endif /*IEM_WITHOUT_ASSEMBLY */
4619
4620
4621/*
4622 * The FISTT instruction was added with SSE3 and is a lot simpler than FIST.
4623 *
4624 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4625 * as if it were the 32-bit version (i.e. starting with exp 31 instead of 15),
4626 * thus the @a a_cBitsIn.
4627 */
4628#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4629IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4630 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4631{ \
4632 uint16_t const fFcw = pFpuState->FCW; \
4633 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4634 bool const fSignIn = pr80Val->s.fSign; \
4635 \
4636 /* \
4637 * Deal with normal numbers first. \
4638 */ \
4639 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4640 { \
4641 uint64_t uMantissa = pr80Val->s.uMantissa; \
4642 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4643 \
4644 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4645 { \
4646 unsigned const cShiftOff = 63 - iExponent; \
4647 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4648 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4649 uMantissa >>= cShiftOff; \
4650 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4651 if (!fSignIn) \
4652 *piDst = (a_iType)uMantissa; \
4653 else \
4654 *piDst = -(a_iType)uMantissa; \
4655 \
4656 if (fRoundedOff) \
4657 { \
4658 fFsw |= X86_FSW_PE; \
4659 if (!(fFcw & X86_FCW_PM)) \
4660 fFsw |= X86_FSW_ES | X86_FSW_B; \
4661 } \
4662 } \
4663 /* \
4664 * Tiny sub-zero numbers. \
4665 */ \
4666 else if (iExponent < 0) \
4667 { \
4668 *piDst = 0; \
4669 fFsw |= X86_FSW_PE; \
4670 if (!(fFcw & X86_FCW_PM)) \
4671 fFsw |= X86_FSW_ES | X86_FSW_B; \
4672 } \
4673 /* \
4674 * Special MIN case. \
4675 */ \
4676 else if ( fSignIn && iExponent == a_cBits - 1 \
4677 && (a_cBits < 64 \
4678 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4679 : uMantissa == RT_BIT_64(63)) ) \
4680 { \
4681 *piDst = a_iTypeMin; \
4682 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4683 { \
4684 fFsw |= X86_FSW_PE; \
4685 if (!(fFcw & X86_FCW_PM)) \
4686 fFsw |= X86_FSW_ES | X86_FSW_B; \
4687 } \
4688 } \
4689 /* \
4690 * Figure out this weirdness. \
4691 */ \
4692 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4693 { \
4694 *piDst = 0; \
4695 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4696 { \
4697 fFsw |= X86_FSW_PE; \
4698 if (!(fFcw & X86_FCW_PM)) \
4699 fFsw |= X86_FSW_ES | X86_FSW_B; \
4700 } \
4701 } \
4702 /* \
4703 * Too large/small number outside the target integer range. \
4704 */ \
4705 else \
4706 { \
4707 fFsw |= X86_FSW_IE; \
4708 if (fFcw & X86_FCW_IM) \
4709 *piDst = a_iTypeIndefinite; \
4710 else \
4711 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4712 } \
4713 } \
4714 /* \
4715 * Map both +0 and -0 to integer zero (signless/+). \
4716 */ \
4717 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4718 *piDst = 0; \
4719 /* \
4720 * Denormals are just really tiny sub-zero numbers that are truncated to zero. \
4721 */ \
4722 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4723 { \
4724 *piDst = 0; \
4725 fFsw |= X86_FSW_PE; \
4726 if (!(fFcw & X86_FCW_PM)) \
4727 fFsw |= X86_FSW_ES | X86_FSW_B; \
4728 } \
4729 /* \
4730 * All other special values are considered invalid arguments and result \
4731 * in an IE exception and indefinite value if masked. \
4732 */ \
4733 else \
4734 { \
4735 fFsw |= X86_FSW_IE; \
4736 if (fFcw & X86_FCW_IM) \
4737 *piDst = a_iTypeIndefinite; \
4738 else \
4739 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4740 } \
4741 *pu16FSW = fFsw; \
4742}
4743#if defined(IEM_WITHOUT_ASSEMBLY)
4744EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4745EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4746EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4747#endif
4748EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4749EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
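
/* Unlike FIST, FISTT always truncates toward zero regardless of FCW.RC: for
   example (illustrative only), +2.9 is stored as 2 and -2.9 as -2, with the
   discarded fraction raising #P via X86_FSW_PE as in the code above. */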
4750
4751
4752#if defined(IEM_WITHOUT_ASSEMBLY)
4753
4754IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4755 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4756{
4757 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4758 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4759 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4760 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4761 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4762
4763 uint16_t const fFcw = pFpuState->FCW;
4764 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4765 bool const fSignIn = pr80Src->s.fSign;
4766
4767 /*
4768 * Deal with normal numbers first.
4769 */
4770 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4771 {
4772 uint64_t uMantissa = pr80Src->s.uMantissa;
4773 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
4774 if ( (uint32_t)iExponent <= 58
4775 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4776 {
4777 unsigned const cShiftOff = 63 - iExponent;
4778 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4779 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4780 ? RT_BIT_64(cShiftOff - 1)
4781 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4782 ? fRoundingOffMask
4783 : 0;
4784 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4785
4786 uMantissa >>= cShiftOff;
4787 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4788 uMantissa += uRounding;
4789 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4790 {
4791 if (fRoundedOff)
4792 {
4793 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4794 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4795 else if (uRounding)
4796 fFsw |= X86_FSW_C1;
4797 fFsw |= X86_FSW_PE;
4798 if (!(fFcw & X86_FCW_PM))
4799 fFsw |= X86_FSW_ES | X86_FSW_B;
4800 }
4801
4802 pd80Dst->s.fSign = fSignIn;
4803 pd80Dst->s.uPad = 0;
4804 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4805 {
4806 unsigned const uDigits = uMantissa % 100;
4807 uMantissa /= 100;
4808 uint8_t const bLo = uDigits % 10;
4809 uint8_t const bHi = uDigits / 10;
4810 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4811 }
4812 }
4813 else
4814 {
4815 /* overflowed after rounding. */
4816 fFsw |= X86_FSW_IE;
4817 if (fFcw & X86_FCW_IM)
4818 *pd80Dst = s_d80Indefinite;
4819 else
4820 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4821 }
4822 }
4823 /*
4824 * Tiny sub-zero numbers.
4825 */
4826 else if (iExponent < 0)
4827 {
4828 if (!fSignIn)
4829 {
4830 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4831 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4832 {
4833 *pd80Dst = s_ad80One[fSignIn];
4834 fFsw |= X86_FSW_C1;
4835 }
4836 else
4837 *pd80Dst = s_ad80Zeros[fSignIn];
4838 }
4839 else
4840 {
4841 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4842 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4843 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4844 *pd80Dst = s_ad80Zeros[fSignIn];
4845 else
4846 {
4847 *pd80Dst = s_ad80One[fSignIn];
4848 fFsw |= X86_FSW_C1;
4849 }
4850 }
4851 fFsw |= X86_FSW_PE;
4852 if (!(fFcw & X86_FCW_PM))
4853 fFsw |= X86_FSW_ES | X86_FSW_B;
4854 }
4855 /*
4856 * Too large/small number outside the target integer range.
4857 */
4858 else
4859 {
4860 fFsw |= X86_FSW_IE;
4861 if (fFcw & X86_FCW_IM)
4862 *pd80Dst = s_d80Indefinite;
4863 else
4864 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4865 }
4866 }
4867 /*
4868 * Map both +0 and -0 to integer zero (signless/+).
4869 */
4870 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4871 *pd80Dst = s_ad80Zeros[fSignIn];
4872 /*
4873 * Denormals are just really tiny sub-zero numbers that are either rounded
4874 * to zero, 1 or -1 depending on sign and rounding control.
4875 */
4876 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4877 {
4878 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4879 *pd80Dst = s_ad80Zeros[fSignIn];
4880 else
4881 {
4882 *pd80Dst = s_ad80One[fSignIn];
4883 fFsw |= X86_FSW_C1;
4884 }
4885 fFsw |= X86_FSW_PE;
4886 if (!(fFcw & X86_FCW_PM))
4887 fFsw |= X86_FSW_ES | X86_FSW_B;
4888 }
4889 /*
4890 * All other special values are considered invalid arguments and result
4891 * in an IE exception and indefinite value if masked.
4892 */
4893 else
4894 {
4895 fFsw |= X86_FSW_IE;
4896 if (fFcw & X86_FCW_IM)
4897 *pd80Dst = s_d80Indefinite;
4898 else
4899 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4900 }
4901 *pu16FSW = fFsw;
4902}
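
/* Worked example for the digit loop above (illustrative only): storing 1234.0
   leaves uMantissa == 1234 after the shift, so abPairs[0] becomes
   RTPBCD80U_MAKE_PAIR(3, 4) and abPairs[1] RTPBCD80U_MAKE_PAIR(1, 2), with the
   remaining seven pairs, the sign and the pad all zero. */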
4903
4904
4905/*********************************************************************************************************************************
4906* FPU Helpers *
4907*********************************************************************************************************************************/
4908AssertCompileSize(RTFLOAT128U, 16);
4909AssertCompileSize(RTFLOAT80U, 10);
4910AssertCompileSize(RTFLOAT64U, 8);
4911AssertCompileSize(RTFLOAT32U, 4);
4912
4913/**
4914 * Normalizes a possible pseudo-denormal value.
4915 *
4916 * Pseudo-denormal values are some oddities from the 8087 & 287 days. They are
4917 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
4918 * i.e. changing uExponent from 0 to 1.
4919 *
4920 * This macro will declare a RTFLOAT80U with the name given by
4921 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
4922 * a normalization was performed.
4923 *
4924 * @note This must be applied before calling SoftFloat with a value that could be
4925 * a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
4926 * correctly.
4927 */
4928#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
4929 RTFLOAT80U a_r80ValNormalized; \
4930 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
4931 { \
4932 a_r80ValNormalized = *a_pr80Val; \
4933 a_r80ValNormalized.s.uExponent = 1; \
4934 a_pr80Val = &a_r80ValNormalized; \
4935 } else do {} while (0)
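
/* Illustrative use of the macro (hypothetical caller, not actual emulator code):
        PCRTFLOAT80U pr80Val = pr80Src;
        IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val, r80Tmp);
   Afterwards pr80Val points at the r80Tmp copy with uExponent == 1 whenever
   pr80Src was a pseudo-denormal, and is left untouched otherwise. */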
4936
4937#ifdef IEM_WITH_FLOAT128_FOR_FPU
4938
4939DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4940{
4941 int fNew;
4942 switch (fFcw & X86_FCW_RC_MASK)
4943 {
4944 default:
4945 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4946 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4947 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4948 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4949 }
4950 int fOld = fegetround();
4951 fesetround(fNew);
4952 return fOld;
4953}
4954
4955
4956DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4957{
4958 fesetround(fOld);
4959}
4960
4961DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4962{
4963 RT_NOREF(fFcw);
4964 RTFLOAT128U Tmp;
4965 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4966 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 47); /* mantissa bits 62:47 */
4967 Tmp.s2.uFractionMid = (uint32_t)(pr80Val->s2.uMantissa >> 15); /* mantissa bits 46:15 */
4968 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 49; /* mantissa bits 14:0, matching the reverse conversion below */
4969 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4970 {
4971 Assert(Tmp.s.uExponent == 0);
4972 Tmp.s2.uSignAndExponent++;
4973 }
4974 return *(_Float128 *)&Tmp;
4975}
4976
4977
4978DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
4979{
4981 RTFLOAT128U Tmp;
4982 *(_Float128 *)&Tmp = rd128ValSrc;
4983 ASMCompilerBarrier();
4984 if (RTFLOAT128U_IS_NORMAL(&Tmp))
4985 {
4986 pr80Dst->s.fSign = Tmp.s64.fSign;
4987 pr80Dst->s.uExponent = Tmp.s64.uExponent;
4988 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
4989 | Tmp.s64.uFractionLo >> (64 - 15);
4990
4991 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4992 unsigned const cShiftOff = 64 - 15;
4993 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4994 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
4995 if (uRoundedOff)
4996 {
4997 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4998 ? RT_BIT_64(cShiftOff - 1)
4999 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5000 ? fRoundingOffMask
5001 : 0;
5002 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5003 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5004 || uRoundedOff != uRoundingAdd)
5005 {
5006 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5007 {
5008 uFraction += 1;
5009 if (!(uFraction & RT_BIT_64(63)))
5010 { /* likely */ }
5011 else
5012 {
5013 uFraction >>= 1;
5014 pr80Dst->s.uExponent++;
5015 if (pr80Dst->s.uExponent == RTFLOAT80U_EXP_MAX)
5016 return fFsw;
5017 }
5018 fFsw |= X86_FSW_C1;
5019 }
5020 }
5021 fFsw |= X86_FSW_PE;
5022 if (!(fFcw & X86_FCW_PM))
5023 fFsw |= X86_FSW_ES | X86_FSW_B;
5024 }
5025 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5026 }
5027 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5028 {
5029 pr80Dst->s.fSign = Tmp.s64.fSign;
5030 pr80Dst->s.uExponent = 0;
5031 pr80Dst->s.uMantissa = 0;
5032 }
5033 else if (RTFLOAT128U_IS_INF(&Tmp))
5034 {
5035 pr80Dst->s.fSign = Tmp.s64.fSign;
5036 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5037 pr80Dst->s.uMantissa = RT_BIT_64(63);
5038 }
5039 return fFsw;
5040}
5041
5042
5043#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5044
5045/** Initializer for the SoftFloat state structure. */
5046# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
5047 { \
5048 softfloat_tininess_afterRounding, \
5049 ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
5050 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP ? (uint8_t)softfloat_round_max \
5051 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN ? (uint8_t)softfloat_round_min \
5052 : (uint8_t)softfloat_round_minMag, \
5053 0, \
5054 (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
5055 ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
5056 : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
5057 }
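
/* A minimal sketch (assuming the iemFpuSoftF80FromIprt helper below) of how the
   arithmetic workers further down drive SoftFloat with this initializer: */
# if 0 /* illustration only */
static extFloat80_t iemFpuExampleAdd(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, uint16_t fFcw)
{
    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
    return extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
}
# endif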
5058
5059/** Returns updated FSW from a SoftFloat state and exception mask (FCW). */
5060# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
5061 ( (a_fFsw) \
5062 | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
5063 | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
5064 | ( ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
5065 ? X86_FSW_ES | X86_FSW_B : 0) )
5066
5067
5068DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5069{
5070 RT_NOREF(fFcw);
5071 Assert(cBits > 64);
5072# if 0 /* rounding does not seem to help */
5073 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5074 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5075 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5076 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5077 {
5078 uint64_t uOld = r128.v[0];
5079 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5080 if (r128.v[0] < uOld)
5081 r128.v[1] += 1;
5082 }
5083# else
5084 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5085# endif
5086 return r128;
5087}
5088
5089
5090DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5091{
5092 RT_NOREF(fFcw);
5093 Assert(cBits > 64);
5094# if 0 /* rounding does not seem to help, not even on constants */
5095 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5096 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5097 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5098 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5099 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5100 {
5101 uint64_t uOld = r128.v[0];
5102 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5103 if (r128.v[0] < uOld)
5104 r128.v[1] += 1;
5105 }
5106 return r128;
5107# else
5108 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5109 return r128;
5110# endif
5111}
5112
5113
5114# if 0 /* unused */
5115DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
5116{
5117 float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
5118 return r128;
5119}
5120# endif
5121
5122
5123/** Converts an 80-bit floating point value to SoftFloat 128-bit floating point. */
5124DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5125{
5126 extFloat80_t Tmp;
5127 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5128 Tmp.signif = pr80Val->s2.uMantissa;
5129 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5130 return extF80_to_f128(Tmp, &Ignored);
5131}
5132
5133
5134/**
5135 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5136 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5137 *
5138 * This is only a structure format conversion, nothing else.
5139 */
5140DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5141{
5142 extFloat80_t Tmp;
5143 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5144 Tmp.signif = pr80Val->s2.uMantissa;
5145 return Tmp;
5146}
5147
5148
5149/**
5150 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5151 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5152 *
5153 * This is only a structure format conversion, nothing else.
5154 */
5155DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5156{
5157 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5158 pr80Dst->s2.uMantissa = r80XSrc.signif;
5159 return pr80Dst;
5160}
5161
5162
5163DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5164{
5166 RTFLOAT128U Tmp;
5167 *(float128_t *)&Tmp = r128Src;
5168 ASMCompilerBarrier();
5169
5170 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5171 {
5172 pr80Dst->s.fSign = Tmp.s64.fSign;
5173 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5174 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5175 | Tmp.s64.uFractionLo >> (64 - 15);
5176
5177 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5178 unsigned const cShiftOff = 64 - 15;
5179 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5180 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5181 if (uRoundedOff)
5182 {
5183 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5184 ? RT_BIT_64(cShiftOff - 1)
5185 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5186 ? fRoundingOffMask
5187 : 0;
5188 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5189 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5190 || uRoundedOff != uRoundingAdd)
5191 {
5192 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5193 {
5194 uFraction += 1;
5195 if (!(uFraction & RT_BIT_64(63)))
5196 { /* likely */ }
5197 else
5198 {
5199 uFraction >>= 1;
5200 pr80Dst->s.uExponent++;
5201 if (pr80Dst->s.uExponent == RTFLOAT80U_EXP_MAX)
5202 return fFsw;
5203 }
5204 fFsw |= X86_FSW_C1;
5205 }
5206 }
5207 fFsw |= X86_FSW_PE;
5208 if (!(fFcw & X86_FCW_PM))
5209 fFsw |= X86_FSW_ES | X86_FSW_B;
5210 }
5211
5212 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5213 }
5214 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5215 {
5216 pr80Dst->s.fSign = Tmp.s64.fSign;
5217 pr80Dst->s.uExponent = 0;
5218 pr80Dst->s.uMantissa = 0;
5219 }
5220 else if (RTFLOAT128U_IS_INF(&Tmp))
5221 {
5222 pr80Dst->s.fSign = Tmp.s64.fSign;
5223 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5224 pr80Dst->s.uMantissa = RT_BIT_64(63);
5225 }
5226 return fFsw;
5227}
5228
5229
5230/**
5231 * Helper for transferring exception and C1 to FSW and setting the result value
5232 * accordingly.
5233 *
5234 * @returns Updated FSW.
5235 * @param pSoftState The SoftFloat state following the operation.
5236 * @param r80XResult The result of the SoftFloat operation.
5237 * @param pr80Result Where to store the result for IEM.
5238 * @param fFcw The FPU control word.
5239 * @param fFsw The FSW before the operation, with necessary bits
5240 * cleared and such.
5241 * @param pr80XcptResult Alternative return value for use when an unmasked \#IE is
5242 * raised.
5243 */
5244DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5245 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5246 PCRTFLOAT80U pr80XcptResult)
5247{
5248 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5249 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5250 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5251 fFsw |= X86_FSW_ES | X86_FSW_B;
5252
5253 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5254 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5255 else
5256 {
5257 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5258 *pr80Result = *pr80XcptResult;
5259 }
5260 return fFsw;
5261}
5262
5263
5264/**
5265 * Helper doing polynomial evaluation using Horner's method.
5266 *
5267 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5268 */
5269float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5270 unsigned cPrecision, softfloat_state_t *pSoftState)
5271{
5272 Assert(cHornerConsts > 1);
5273 size_t i = cHornerConsts - 1;
5274 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5275 while (i-- > 0)
5276 {
5277 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5278 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5279 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5280 }
5281 return r128Result;
5282}
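
/* The same Horner scheme in plain double precision, as a minimal sketch of the
   algorithm (illustration only; the real helper above works in float128 so the
   intermediate products keep enough guard bits): */
# if 0 /* illustration only */
static double iemFpuExampleHornerPoly(double z, const double *paConsts, size_t cConsts)
{
    size_t i       = cConsts - 1;
    double rResult = paConsts[i];            /* start with the highest-order coefficient */
    while (i-- > 0)
        rResult = rResult * z + paConsts[i]; /* one multiply-add per coefficient */
    return rResult;
}
# endif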
5283
5284#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5285
5286
5287/**
5288 * Composes a normalized and rounded RTFLOAT80U result from a 192-bit wide
5289 * mantissa, exponent and sign.
5290 *
5291 * @returns Updated FSW.
5292 * @param pr80Dst Where to return the composed value.
5293 * @param fSign The sign.
5294 * @param puMantissa The mantissa, a 256-bit type but the top 64 bits are
5295 * ignored and should be zero. This will probably be
5296 * modified during normalization and rounding.
5297 * @param iExponent Unbiased exponent.
5298 * @param fFcw The FPU control word.
5299 * @param fFsw The FPU status word.
5300 */
5301static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5302 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5303{
5304 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5305
5306 iExponent += RTFLOAT80U_EXP_BIAS;
5307
5308 /* Do normalization if necessary and possible. */
5309 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5310 {
5311 int cShift = 192 - RTUInt256BitCount(puMantissa);
5312 if (iExponent > cShift)
5313 iExponent -= cShift;
5314 else
5315 {
5316 if (fFcw & X86_FCW_UM)
5317 {
5318 if (iExponent > 0)
5319 cShift = --iExponent;
5320 else
5321 cShift = 0;
5322 }
5323 iExponent -= cShift;
5324 }
5325 RTUInt256AssignShiftLeft(puMantissa, cShift);
5326 }
5327
5328 /* Do rounding. */
5329 uint64_t uMantissa = puMantissa->QWords.qw2;
5330 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5331 {
5332 bool fAdd;
5333 switch (fFcw & X86_FCW_RC_MASK)
5334 {
5335 default: /* (for the simple-minded MSC which otherwise thinks fAdd would be used uninitialized) */
5336 case X86_FCW_RC_NEAREST:
5337 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5338 {
5339 if ( (uMantissa & 1)
5340 || puMantissa->QWords.qw0 != 0
5341 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5342 {
5343 fAdd = true;
5344 break;
5345 }
5346 uMantissa &= ~(uint64_t)1;
5347 }
5348 fAdd = false;
5349 break;
5350 case X86_FCW_RC_ZERO:
5351 fAdd = false;
5352 break;
5353 case X86_FCW_RC_UP:
5354 fAdd = !fSign;
5355 break;
5356 case X86_FCW_RC_DOWN:
5357 fAdd = fSign;
5358 break;
5359 }
5360 if (fAdd)
5361 {
5362 uint64_t const uTmp = uMantissa;
5363 uMantissa = uTmp + 1;
5364 if (uMantissa < uTmp)
5365 {
5366 uMantissa >>= 1;
5367 uMantissa |= RT_BIT_64(63);
5368 iExponent++;
5369 }
5370 fFsw |= X86_FSW_C1;
5371 }
5372 fFsw |= X86_FSW_PE;
5373 if (!(fFcw & X86_FCW_PM))
5374 fFsw |= X86_FSW_ES | X86_FSW_B;
5375 }
5376
5377 /* Check for underflow (denormals). */
5378 if (iExponent <= 0)
5379 {
5380 if (fFcw & X86_FCW_UM)
5381 {
5382 if (uMantissa & RT_BIT_64(63))
5383 uMantissa >>= 1;
5384 iExponent = 0;
5385 }
5386 else
5387 {
5388 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5389 fFsw |= X86_FSW_ES | X86_FSW_B;
5390 }
5391 fFsw |= X86_FSW_UE;
5392 }
5393 /* Check for overflow */
5394 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5395 {
5396 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5397 }
5398
5399 /* Compose the result. */
5400 pr80Dst->s.uMantissa = uMantissa;
5401 pr80Dst->s.uExponent = iExponent;
5402 pr80Dst->s.fSign = fSign;
5403 return fFsw;
5404}
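
/* Note on the nearest-even case above (illustrative): when qw1:qw0 hold exactly
   the half-way pattern (qw1 == RT_BIT_64(63), qw0 == 0) and qw2 is already even,
   no increment is done; any other non-zero remainder rounds according to FCW.RC
   and sets X86_FSW_PE, plus X86_FSW_C1 when rounding up. */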
5405
5406
5407/**
5408 * See also iemAImpl_fld_r80_from_r32
5409 */
5410static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5411{
5412 uint16_t fFsw = 0;
5413 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5414 {
5415 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5416 pr80Dst->sj64.fInteger = 1;
5417 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5418 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5419 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5420 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5421 }
5422 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5423 {
5424 pr80Dst->s.fSign = pr32Val->s.fSign;
5425 pr80Dst->s.uExponent = 0;
5426 pr80Dst->s.uMantissa = 0;
5427 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5428 }
5429 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5430 {
5431 /* Subnormal -> normalized + X86_FSW_DE return. */
5432 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5433 pr80Dst->sj64.fInteger = 1;
5434 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5435 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5436 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5437 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5438 fFsw = X86_FSW_DE;
5439 }
5440 else if (RTFLOAT32U_IS_INF(pr32Val))
5441 {
5442 pr80Dst->s.fSign = pr32Val->s.fSign;
5443 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5444 pr80Dst->s.uMantissa = RT_BIT_64(63);
5445 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5446 }
5447 else
5448 {
5449 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5450 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5451 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5452 pr80Dst->sj64.fInteger = 1;
5453 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5454 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5455 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5456 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5457 }
5458 return fFsw;
5459}
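
/* Worked example for the subnormal branch above (illustrative): the smallest
   positive single, 0x00000001 == 2**-149, has uFraction == 1, so cExtraShift is
   23 - 1 = 22 and the lone fraction bit shifts up into the integer-bit position,
   leaving uFraction zero, fInteger one and uExponent 0 - 127 + 16383 - 22 = 16234,
   i.e. an unbiased exponent of -149. */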
5460
5461
5462/**
5463 * See also iemAImpl_fld_r80_from_r64
5464 */
5465static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5466{
5467 uint16_t fFsw = 0;
5468 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5469 {
5470 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5471 pr80Dst->sj64.fInteger = 1;
5472 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5473 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5474 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5475 }
5476 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5477 {
5478 pr80Dst->s.fSign = pr64Val->s.fSign;
5479 pr80Dst->s.uExponent = 0;
5480 pr80Dst->s.uMantissa = 0;
5481 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5482 }
5483 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5484 {
5485 /* Subnormal values get normalized. */
5486 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5487 pr80Dst->sj64.fInteger = 1;
5488 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5489 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5490 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5491 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5492 fFsw = X86_FSW_DE;
5493 }
5494 else if (RTFLOAT64U_IS_INF(pr64Val))
5495 {
5496 pr80Dst->s.fSign = pr64Val->s.fSign;
5497 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5498 pr80Dst->s.uMantissa = RT_BIT_64(63);
5499 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5500 }
5501 else
5502 {
5503 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5504 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5505 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5506 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5507 pr80Dst->sj64.fInteger = 1;
5508 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5509 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5510 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5511 }
5512 return fFsw;
5513}
5514
5515
5516/**
5517 * See also EMIT_FILD.
5518 */
5519#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5520static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5521{ \
5522 if (iVal == 0) \
5523 { \
5524 pr80Dst->s.fSign = 0; \
5525 pr80Dst->s.uExponent = 0; \
5526 pr80Dst->s.uMantissa = 0; \
5527 } \
5528 else \
5529 { \
5530 if (iVal > 0) \
5531 pr80Dst->s.fSign = 0; \
5532 else \
5533 { \
5534 pr80Dst->s.fSign = 1; \
5535 iVal = -iVal; \
5536 } \
5537 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5538 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5539 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5540 } \
5541 return pr80Dst; \
5542}
5543EMIT_CONVERT_IXX_TO_R80(16)
5544EMIT_CONVERT_IXX_TO_R80(32)
5545//EMIT_CONVERT_IXX_TO_R80(64)
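
/* Example conversion (illustrative only): iemAImplConvertI16ToR80(1000, &r80)
   sees cBits == 10 (ASMBitLastSetU16 is 1-based), so uExponent becomes
   10 - 1 + RTFLOAT80U_EXP_BIAS == 16392 (unbiased 9) and uMantissa becomes
   1000 << 54, putting the leading set bit of 1000 into the J-bit position:
   1.111101b * 2**9 == 1000.0. */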
5546
5547/** For implementing iemAImpl_fmul_r80_by_r64 and such. */
5548#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
5549IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
5550{ \
5551 RTFLOAT80U r80Val2; \
5552 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
5553 Assert(!fFsw || fFsw == X86_FSW_DE); \
5554 if (fFsw) \
5555 { \
5556 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
5557 fFsw = 0; \
5558 else if (!(pFpuState->FCW & X86_FCW_DM)) \
5559 { \
5560 pFpuRes->r80Result = *pr80Val1; \
5561 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
5562 | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
5563 return; \
5564 } \
5565 } \
5566 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
5567 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
5568}
5569
5570/** For implementing iemAImpl_fmul_r80_by_r32 and such. */
5571#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
5572IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
5573{ \
5574 RTFLOAT80U r80Val2; \
5575 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
5576 Assert(!fFsw || fFsw == X86_FSW_DE); \
5577 if (fFsw) \
5578 { \
5579 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
5580 fFsw = 0; \
5581 else if (!(pFpuState->FCW & X86_FCW_DM)) \
5582 { \
5583 pFpuRes->r80Result = *pr80Val1; \
5584 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
5585 | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
5586 return; \
5587 } \
5588 } \
5589 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
5590 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
5591}
5592
5593/** For implementing iemAImpl_fimul_r80_by_i32 and such. */
5594#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
5595IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
5596{ \
5597 RTFLOAT80U r80Val2; \
5598 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2)); \
5599 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
5600}
5601
5602/** For implementing iemAImpl_fimul_r80_by_i16 and such. */
5603#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
5604IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
5605{ \
5606 RTFLOAT80U r80Val2; \
5607 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2)); \
5608 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
5609}
5610
5611
5612
5613/*********************************************************************************************************************************
5614* x87 FPU Division Operations                                                                                                 *
5615*********************************************************************************************************************************/
5616
5617/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5618static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5619 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5620{
5621 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5622 {
5623 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5624 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5625 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5626 }
5627 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5628 { /* Div by zero. */
5629 if (fFcw & X86_FCW_ZM)
5630 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5631 else
5632 {
5633 *pr80Result = *pr80Val1Org;
5634 fFsw |= X86_FSW_ES | X86_FSW_B;
5635 }
5636 fFsw |= X86_FSW_ZE;
5637 }
5638 else
5639 { /* Invalid operand */
5640 if (fFcw & X86_FCW_IM)
5641 *pr80Result = g_r80Indefinite;
5642 else
5643 {
5644 *pr80Result = *pr80Val1Org;
5645 fFsw |= X86_FSW_ES | X86_FSW_B;
5646 }
5647 fFsw |= X86_FSW_IE;
5648 }
5649 return fFsw;
5650}
5651
5652
5653IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5654 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5655{
5656 uint16_t const fFcw = pFpuState->FCW;
5657 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5658
5659 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals. */
5660 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5661 {
5662 if (fFcw & X86_FCW_IM)
5663 pFpuRes->r80Result = g_r80Indefinite;
5664 else
5665 {
5666 pFpuRes->r80Result = *pr80Val1;
5667 fFsw |= X86_FSW_ES | X86_FSW_B;
5668 }
5669 fFsw |= X86_FSW_IE;
5670 }
5671 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs & /0 trump denormals. */
5672 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5673 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5674 {
5675 if (fFcw & X86_FCW_DM)
5676 {
5677 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5678 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5679 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5680 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5681 }
5682 else
5683 {
5684 pFpuRes->r80Result = *pr80Val1;
5685 fFsw |= X86_FSW_ES | X86_FSW_B;
5686 }
5687 fFsw |= X86_FSW_DE;
5688 }
5689 /* SoftFloat can handle the rest: */
5690 else
5691 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5692
5693 pFpuRes->FSW = fFsw;
5694}
5695
5696
5697EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5698EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5699EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5700EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5701
5702
5703IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5704 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5705{
5706 uint16_t const fFcw = pFpuState->FCW;
5707 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5708
5709 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals. */
5710 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5711 {
5712 if (fFcw & X86_FCW_IM)
5713 pFpuRes->r80Result = g_r80Indefinite;
5714 else
5715 {
5716 pFpuRes->r80Result = *pr80Val1;
5717 fFsw |= X86_FSW_ES | X86_FSW_B;
5718 }
5719 fFsw |= X86_FSW_IE;
5720 }
5721 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs & /0 trump denormals. */
5722 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5723 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5724 {
5725 if (fFcw & X86_FCW_DM)
5726 {
5727 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5728 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5729 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5730 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5731 }
5732 else
5733 {
5734 pFpuRes->r80Result = *pr80Val1;
5735 fFsw |= X86_FSW_ES | X86_FSW_B;
5736 }
5737 fFsw |= X86_FSW_DE;
5738 }
5739 /* SoftFloat can handle the rest: */
5740 else
5741 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5742
5743 pFpuRes->FSW = fFsw;
5744}
5745
5746
5747EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5748EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5749EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5750EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5751
5752
5753/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5754static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5755 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5756{
5757 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5758 {
5759 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5760 uint16_t fCxFlags = 0;
5761 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5762 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5763 &fCxFlags, &SoftState);
5764 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5765 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5766 if ( !(fFsw & X86_FSW_IE)
5767 && !RTFLOAT80U_IS_NAN(pr80Result)
5768 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5769 {
5770 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5771 fFsw |= fCxFlags & X86_FSW_C_MASK;
5772 }
5773 return fFsw;
5774 }
5775
5776 /* Invalid operand */
5777 if (fFcw & X86_FCW_IM)
5778 *pr80Result = g_r80Indefinite;
5779 else
5780 {
5781 *pr80Result = *pr80Val1Org;
5782 fFsw |= X86_FSW_ES | X86_FSW_B;
5783 }
5784 return fFsw | X86_FSW_IE;
5785}
5786
5787
5788static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5789 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5790{
5791 uint16_t const fFcw = pFpuState->FCW;
5792 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5793
5794 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals.
5795 In addition, we handle zero ST(1) here ourselves as SoftFloat returns Inf instead
5796 of Indefinite. (Note! There is no #Z, contrary to what the footnotes to tables
5797 3-31 and 3-32 for the FPREM & FPREM1 instructions in the Intel reference manual claim!) */
5798 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5799 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5800 {
5801 if (fFcw & X86_FCW_IM)
5802 pFpuRes->r80Result = g_r80Indefinite;
5803 else
5804 {
5805 pFpuRes->r80Result = *pr80Val1;
5806 fFsw |= X86_FSW_ES | X86_FSW_B;
5807 }
5808 fFsw |= X86_FSW_IE;
5809 }
5810 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs & /0 trump denormals. */
5811 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5812 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5813 {
5814 if (fFcw & X86_FCW_DM)
5815 {
5816 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5817 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5818 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5819 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5820 pr80Val1Org, fLegacyInstr);
5821 }
5822 else
5823 {
5824 pFpuRes->r80Result = *pr80Val1;
5825 fFsw |= X86_FSW_ES | X86_FSW_B;
5826 }
5827 fFsw |= X86_FSW_DE;
5828 }
5829 /* SoftFloat can handle the rest: */
5830 else
5831 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5832 pr80Val1, fLegacyInstr);
5833
5834 pFpuRes->FSW = fFsw;
5835}
5836
5837
5838IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5839 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5840{
5841 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5842}
5843
5844
5845IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5846 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5847{
5848 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5849}
5850
5851
5852/*********************************************************************************************************************************
5853* x87 FPU Multiplication Operations *
5854*********************************************************************************************************************************/
5855
5856/** Worker for iemAImpl_fmul_r80_by_r80. */
5857static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5858 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5859{
5860 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5861 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5862 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5863}
5864
5865
5866IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5867 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5868{
5869 uint16_t const fFcw = pFpuState->FCW;
5870 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5871
5872 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals. */
5873 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5874 {
5875 if (fFcw & X86_FCW_IM)
5876 pFpuRes->r80Result = g_r80Indefinite;
5877 else
5878 {
5879 pFpuRes->r80Result = *pr80Val1;
5880 fFsw |= X86_FSW_ES | X86_FSW_B;
5881 }
5882 fFsw |= X86_FSW_IE;
5883 }
5884 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs trump denormals. */
5885 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5886 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5887 {
5888 if (fFcw & X86_FCW_DM)
5889 {
5890 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5891 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5892 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5893 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5894 }
5895 else
5896 {
5897 pFpuRes->r80Result = *pr80Val1;
5898 fFsw |= X86_FSW_ES | X86_FSW_B;
5899 }
5900 fFsw |= X86_FSW_DE;
5901 }
5902 /* SoftFloat can handle the rest: */
5903 else
5904 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5905
5906 pFpuRes->FSW = fFsw;
5907}
5908
5909
5910EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5911EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5912EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5913EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5914
5915
5916/*********************************************************************************************************************************
5917* x87 FPU Addition *
5918*********************************************************************************************************************************/
5919
5920/** Worker for iemAImpl_fadd_r80_by_r80. */
5921static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5922 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5923{
5924 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5925 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5926 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5927}
5928
5929
5930IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5931 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5932{
5933 uint16_t const fFcw = pFpuState->FCW;
5934 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5935
5936 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals. */
5937 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5938 {
5939 if (fFcw & X86_FCW_IM)
5940 pFpuRes->r80Result = g_r80Indefinite;
5941 else
5942 {
5943 pFpuRes->r80Result = *pr80Val1;
5944 fFsw |= X86_FSW_ES | X86_FSW_B;
5945 }
5946 fFsw |= X86_FSW_IE;
5947 }
5948 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs trump denormals. */
5949 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5950 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5951 {
5952 if (fFcw & X86_FCW_DM)
5953 {
5954 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5955 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5956 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5957 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5958 }
5959 else
5960 {
5961 pFpuRes->r80Result = *pr80Val1;
5962 fFsw |= X86_FSW_ES | X86_FSW_B;
5963 }
5964 fFsw |= X86_FSW_DE;
5965 }
5966 /* SoftFloat can handle the rest: */
5967 else
5968 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5969
5970 pFpuRes->FSW = fFsw;
5971}
5972
5973
5974EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
5975EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
5976EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
5977EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
5978
5979
5980/*********************************************************************************************************************************
5981* x87 FPU Subtraction *
5982*********************************************************************************************************************************/
5983
5984/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
5985static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5986 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5987{
5988 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5989 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5990 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5991}
5992
5993
5994IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5995 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5996{
5997 uint16_t const fFcw = pFpuState->FCW;
5998 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5999
6000 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals. */
6001 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6002 {
6003 if (fFcw & X86_FCW_IM)
6004 pFpuRes->r80Result = g_r80Indefinite;
6005 else
6006 {
6007 pFpuRes->r80Result = *pr80Val1;
6008 fFsw |= X86_FSW_ES | X86_FSW_B;
6009 }
6010 fFsw |= X86_FSW_IE;
6011 }
6012 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs trump denormals. */
6013 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6014 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6015 {
6016 if (fFcw & X86_FCW_DM)
6017 {
6018 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6019 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6020 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6021 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6022 }
6023 else
6024 {
6025 pFpuRes->r80Result = *pr80Val1;
6026 fFsw |= X86_FSW_ES | X86_FSW_B;
6027 }
6028 fFsw |= X86_FSW_DE;
6029 }
6030 /* SoftFloat can handle the rest: */
6031 else
6032 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6033
6034 pFpuRes->FSW = fFsw;
6035}
6036
6037
6038EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6039EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6040EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6041EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6042
6043
6044/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6045IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6046 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6047{
6048 uint16_t const fFcw = pFpuState->FCW;
6049 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6050
6051 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals. */
6052 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6053 {
6054 if (fFcw & X86_FCW_IM)
6055 pFpuRes->r80Result = g_r80Indefinite;
6056 else
6057 {
6058 pFpuRes->r80Result = *pr80Val1;
6059 fFsw |= X86_FSW_ES | X86_FSW_B;
6060 }
6061 fFsw |= X86_FSW_IE;
6062 }
6063 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs trump denormals. */
6064 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6065 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6066 {
6067 if (fFcw & X86_FCW_DM)
6068 {
6069 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6070 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6071 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6072 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6073 }
6074 else
6075 {
6076 pFpuRes->r80Result = *pr80Val1;
6077 fFsw |= X86_FSW_ES | X86_FSW_B;
6078 }
6079 fFsw |= X86_FSW_DE;
6080 }
6081 /* SoftFloat can handle the rest: */
6082 else
6083 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6084
6085 pFpuRes->FSW = fFsw;
6086}
6087
6088
6089EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6090EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6091EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6092EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6093
6094
6095/*********************************************************************************************************************************
6096* x87 FPU Trigonometric Operations                                                                                            *
6097*********************************************************************************************************************************/
6098
6099
6100IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6101 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6102{
6103 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6104 AssertReleaseFailed();
6105}
6106
6107#endif /* IEM_WITHOUT_ASSEMBLY */
6108
6109IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6110 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6111{
6112 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6113}
6114
6115IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6116 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6117{
6118 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6119}
6120
6121
6122#if defined(IEM_WITHOUT_ASSEMBLY)
6123IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6124{
6125 RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
6126 AssertReleaseFailed();
6127}
6128#endif /* IEM_WITHOUT_ASSEMBLY */
6129
6130IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6131{
6132 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6133}
6134
6135IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6136{
6137 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6138}
6139
6140
6141#ifdef IEM_WITHOUT_ASSEMBLY
6142IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6143{
6144 RT_NOREF(pFpuState, pFpuRes, pr80Val);
6145 AssertReleaseFailed();
6146}
6147#endif /* IEM_WITHOUT_ASSEMBLY */
6148
6149IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6150{
6151 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6152}
6153
6154IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6155{
6156 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6157}
6158
6159#ifdef IEM_WITHOUT_ASSEMBLY
6160IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6161{
6162 RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
6163 AssertReleaseFailed();
6164}
6165#endif /* IEM_WITHOUT_ASSEMBLY */
6166
6167IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6168{
6169 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6170}
6171
6172IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6173{
6174 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6175}
6176
6177
6178#ifdef IEM_WITHOUT_ASSEMBLY
6179IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6180{
6181 RT_NOREF(pFpuState, pFpuRes, pr80Val);
6182 AssertReleaseFailed();
6183}
6184#endif /* IEM_WITHOUT_ASSEMBLY */
6185
6186IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6187{
6188 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6189}
6190
6191IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6192{
6193 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6194}
6195
6196#ifdef IEM_WITHOUT_ASSEMBLY
6197
6198
6199/*********************************************************************************************************************************
6200* x87 FPU Compare and Testing Operations *
6201*********************************************************************************************************************************/
6202
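/**
 * FTST - compares ST(0) with +0.0 and reports the result in C3/C2/C0.
 *
 * Condition code mapping per the SDM (C3 C2 C0): 000 = ST(0) > 0.0,
 * 001 = ST(0) < 0.0, 100 = ST(0) = 0.0, 111 = unordered. A minimal sketch of
 * how a caller might decode the result (hypothetical helper, illustration
 * only, not used by IEM):
 * @code{.c}
 *  static const char *ftstResultToString(uint16_t fFsw)
 *  {
 *      switch (fFsw & (X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0))
 *      {
 *          case 0:                                    return "ST(0) > 0.0";
 *          case X86_FSW_C0:                           return "ST(0) < 0.0";
 *          case X86_FSW_C3:                           return "ST(0) = 0.0";
 *          case X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0: return "unordered";
 *          default:                                   return "unexpected";
 *      }
 *  }
 * @endcode
 */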
6203IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6204{
6205 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6206
6207 if (RTFLOAT80U_IS_ZERO(pr80Val))
6208 fFsw |= X86_FSW_C3;
6209 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6210 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6211 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6212 {
6213 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6214 if (!(pFpuState->FCW & X86_FCW_DM))
6215 fFsw |= X86_FSW_ES | X86_FSW_B;
6216 }
6217 else
6218 {
6219 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6220 if (!(pFpuState->FCW & X86_FCW_IM))
6221 fFsw |= X86_FSW_ES | X86_FSW_B;
6222 }
6223
6224 *pu16Fsw = fFsw;
6225}
6226
6227
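/**
 * FXAM - examines ST(0) and classifies it in C3/C2/C0, with C1 = sign.
 *
 * Mapping per the SDM (C3 C2 C0): 000 = unsupported, 001 = NaN, 010 = normal,
 * 011 = infinity, 100 = zero, 101 = empty, 110 = denormal; this is exactly
 * what the if/else ladder below produces.
 */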
6228IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6229{
6231 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6232
6233 /* C1 = sign bit (always set from the sign; Intel says even for empty registers). */
6234 if (pr80Val->s.fSign)
6235 fFsw |= X86_FSW_C1;
6236
6237 /* Classify the value in C0, C2, C3. */
6238 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6239 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6240 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6241 fFsw |= X86_FSW_C2;
6242 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6243 fFsw |= X86_FSW_C3;
6244 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6245 fFsw |= X86_FSW_C0;
6246 else if (RTFLOAT80U_IS_INF(pr80Val))
6247 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6248 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6249 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6250 /* whatever else: 0 */
6251
6252 *pu16Fsw = fFsw;
6253}
6254
6255
6256/**
6257 * Worker for fcom, fucom, and friends.
6258 */
6259static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6260 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6261{
6262 /*
6263 * Unpack the values.
6264 */
6265 bool const fSign1 = pr80Val1->s.fSign;
6266 int32_t iExponent1 = pr80Val1->s.uExponent;
6267 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6268
6269 bool const fSign2 = pr80Val2->s.fSign;
6270 int32_t iExponent2 = pr80Val2->s.uExponent;
6271 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6272
6273 /*
6274 * Check for invalid inputs.
6275 */
6276 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6277 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6278 {
6279 if (!(fFcw & X86_FCW_IM))
6280 fFsw |= X86_FSW_ES | X86_FSW_B;
6281 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6282 }
6283
6284 /*
6285 * Check for NaNs and indefinites; they are all unordered and trump #DE.
6286 */
6287 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6288 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6289 {
6290 if ( fIeOnAllNaNs
6291 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6292 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6293 {
6294 fFsw |= X86_FSW_IE;
6295 if (!(fFcw & X86_FCW_IM))
6296 fFsw |= X86_FSW_ES | X86_FSW_B;
6297 }
6298 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6299 }
6300
6301 /*
6302 * Normalize the values.
6303 */
6304 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6305 {
6306 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6307 iExponent1 = 1;
6308 else
6309 {
6310 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6311 uMantissa1 <<= iExponent1;
6312 iExponent1 = 1 - iExponent1;
6313 }
6314 fFsw |= X86_FSW_DE;
6315 if (!(fFcw & X86_FCW_DM))
6316 fFsw |= X86_FSW_ES | X86_FSW_B;
6317 }
6318
6319 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6320 {
6321 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6322 iExponent2 = 1;
6323 else
6324 {
6325 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6326 uMantissa2 <<= iExponent2;
6327 iExponent2 = 1 - iExponent2;
6328 }
6329 fFsw |= X86_FSW_DE;
6330 if (!(fFcw & X86_FCW_DM))
6331 fFsw |= X86_FSW_ES | X86_FSW_B;
6332 }
6333
6334 /*
6335 * Test if equal (val1 == val2):
6336 */
6337 if ( uMantissa1 == uMantissa2
6338 && iExponent1 == iExponent2
6339 && ( fSign1 == fSign2
6340 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6341 fFsw |= X86_FSW_C3;
6342 /*
6343 * Test if less than (val1 < val2):
6344 */
6345 else if (fSign1 && !fSign2)
6346 fFsw |= X86_FSW_C0;
6347 else if (fSign1 == fSign2)
6348 {
6349 /* Zeros are problematic; however, at most one of the two can be zero here. */
6350 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6351 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6352 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6353 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6354
6355 if ( fSign1
6356 ^ ( iExponent1 < iExponent2
6357 || ( iExponent1 == iExponent2
6358 && uMantissa1 < uMantissa2 ) ) )
6359 fFsw |= X86_FSW_C0;
6360 }
6361 /* else: No flags set if greater. */
6362
6363 return fFsw;
6364}
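/*
 * Illustration of the denormal normalization done by the worker above (a
 * sketch in plain C, using the IPRT ASMBitLastSetU64 helper; not part of the
 * emulation itself). A denormal has a biased exponent field of 0 but an
 * effective exponent of 1, so shifting the mantissa up until bit 63 is set
 * leaves the value unchanged as long as the exponent is adjusted by the same
 * amount:
 * @code{.c}
 *  uint64_t uMantissa = UINT64_C(1);                      // smallest denormal
 *  int32_t  iShift    = 64 - ASMBitLastSetU64(uMantissa); // 63
 *  uMantissa <<= iShift;                                  // 0x8000000000000000
 *  int32_t  iExponent = 1 - iShift;                       // -62 (biased)
 *  // Value is unchanged: 2^(iExponent - 16383) * uMantissa / 2^63 = 2^-16445.
 * @endcode
 */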
6365
6366
6367IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6368 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6369{
6370 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6371}
6372
6373
6376IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6377 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6378{
6379 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6380}
6381
6382
6383IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6384 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6385{
6386 RTFLOAT80U r80Val2;
6387 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6388 Assert(!fFsw || fFsw == X86_FSW_DE);
6389 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6390 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6391 {
6392 if (!(pFpuState->FCW & X86_FCW_DM))
6393 fFsw |= X86_FSW_ES | X86_FSW_B;
6394 *pfFsw |= fFsw;
6395 }
6396}
6397
6398
6399IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6400 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6401{
6402 RTFLOAT80U r80Val2;
6403 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6404 Assert(!fFsw || fFsw == X86_FSW_DE);
6405 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6406 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6407 {
6408 if (!(pFpuState->FCW & X86_FCW_DM))
6409 fFsw |= X86_FSW_ES | X86_FSW_B;
6410 *pfFsw |= fFsw;
6411 }
6412}
6413
6414
6415IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6416 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6417{
6418 RTFLOAT80U r80Val2;
6419 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6420 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6421}
6422
6423
6424IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6425 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6426{
6427 RTFLOAT80U r80Val2;
6428 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6429 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6430}
6431
6432
6433/**
6434 * Worker for fcomi & fucomi.
6435 */
6436static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6437 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6438{
6439 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6440 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6441 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6442 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6443
6444 /* Note! Contrary to the docs, C1 is not cleared; all condition code flags are preserved. */
6445 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6446 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6447}
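/*
 * Note that the three FSW -> EFLAGS shifts above all work out to 8:
 * C3 (FSW bit 14) -> ZF (bit 6), C2 (bit 10) -> PF (bit 2) and C0 (bit 8)
 * -> CF (bit 0). E.g. for an equal result (illustration only):
 * @code{.c}
 *  uint16_t const fFsw = X86_FSW_C3;                         // 0x4000
 *  uint32_t const fZf  = (fFsw & X86_FSW_C3)
 *                      >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT); // 0x0040 == X86_EFL_ZF
 * @endcode
 */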
6448
6449
6450IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6451 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6452{
6453 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
6454}
6455
6456
6457IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6458 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6459{
6460 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
6461}
6462
6463
6464/*********************************************************************************************************************************
6465* x87 FPU Other Operations *
6466*********************************************************************************************************************************/
6467
6468/**
6469 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
6470 */
6471static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6472{
6473 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6474 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
6475 true /*exact / generate #PE */, &SoftState));
6476 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6477}
6478
6479
6480IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6481{
6482 uint16_t const fFcw = pFpuState->FCW;
6483 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6484
6485 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6486 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6487 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6488 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6489 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6490 || RTFLOAT80U_IS_INF(pr80Val))
6491 pFpuRes->r80Result = *pr80Val;
6492 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6493 {
6494 fFsw |= X86_FSW_DE;
6495 if (fFcw & X86_FCW_DM)
6496 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6497 else
6498 {
6499 pFpuRes->r80Result = *pr80Val;
6500 fFsw |= X86_FSW_ES | X86_FSW_B;
6501 }
6502 }
6503 else
6504 {
6505 if (fFcw & X86_FCW_IM)
6506 {
6507 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6508 pFpuRes->r80Result = g_r80Indefinite;
6509 else
6510 {
6511 pFpuRes->r80Result = *pr80Val;
6512 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6513 }
6514 }
6515 else
6516 {
6517 pFpuRes->r80Result = *pr80Val;
6518 fFsw |= X86_FSW_ES | X86_FSW_B;
6519 }
6520 fFsw |= X86_FSW_IE;
6521 }
6522 pFpuRes->FSW = fFsw;
6523}
6524
6525
6526IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6527 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6528{
6529 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
6530 it does everything we need it to do. */
6531 uint16_t const fFcw = pFpuState->FCW;
6532 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6533 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6534 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6535 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6536}
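/*
 * FSCALE computes ST(0) * 2^trunc(ST(1)), i.e. a pure exponent adjustment.
 * A reference sketch in plain C doubles (illustration only, assuming
 * <math.h>; the real code must handle the full 80-bit special cases):
 * @code{.c}
 *  #include <math.h>
 *  static double fscaleRef(double rdVal, double rdScale)
 *  {
 *      return ldexp(rdVal, (int)trunc(rdScale)); // rdVal * 2^trunc(rdScale)
 *  }
 * @endcode
 */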
6537
6538
6539/**
6540 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
6541 */
6542static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6543{
6544 Assert(!pr80Val->s.fSign);
6545 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6546 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
6547 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6548}
6549
6550
6551IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6552{
6553 uint16_t const fFcw = pFpuState->FCW;
6554 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6555
6556 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
6557 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6558 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6559 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6560 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6561 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
6562 pFpuRes->r80Result = *pr80Val;
6563 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
6564 {
6565 fFsw |= X86_FSW_DE;
6566 if (fFcw & X86_FCW_DM)
6567 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6568 else
6569 {
6570 pFpuRes->r80Result = *pr80Val;
6571 fFsw |= X86_FSW_ES | X86_FSW_B;
6572 }
6573 }
6574 else
6575 {
6576 if (fFcw & X86_FCW_IM)
6577 {
6578 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6579 pFpuRes->r80Result = g_r80Indefinite;
6580 else
6581 {
6582 pFpuRes->r80Result = *pr80Val;
6583 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6584 }
6585 }
6586 else
6587 {
6588 pFpuRes->r80Result = *pr80Val;
6589 fFsw |= X86_FSW_ES | X86_FSW_B;
6590 }
6591 fFsw |= X86_FSW_IE;
6592 }
6593 pFpuRes->FSW = fFsw;
6594}
6595
6596
6597/**
6598 * @code{.unparsed}
6599 *          x         x * ln2
6600 * f(x) =  2  - 1  =  e          - 1
6601 *
6602 * @endcode
6603 *
6604 * We can approximate e^x by a Taylor/Maclaurin series (see
6605 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
6606 * @code{.unparsed}
6607 *         n      0     1     2     3     4
6608 *  inf   x      x     x     x     x     x
6609 *  SUM  ---- = --- + --- + --- + --- + --- + ...
6610 *  n=0   n!    0!    1!    2!    3!    4!
6611 *
6612 *                2     3     4
6613 *               x     x     x
6614 *    = 1 + x + --- + --- + --- + ...
6615 *              2!    3!    4!
6616 * @endcode
6617 *
6618 * Given z = x * ln2, we get:
6619 * @code{.unparsed}
6620 *                  2     3     4           n
6621 *    z            z     z     z           z
6622 *   e  - 1 = z + --- + --- + --- + ... + ---
6623 *                2!    3!    4!          n!
6624 * @endcode
6625 *
6626 * Wanting to use Horner's method, we move one z outside and get:
6627 * @code{.unparsed}
6628 *                    2     3            (n-1)
6629 *             z     z     z            z
6630 *  = z ( 1 + --- + --- + --- + ... + ------- )
6631 *            2!    3!    4!            n!
6632 * @endcode
6633 *
6634 * The constants we need for using Horner's method are 1 and 1 / n!.
6635 *
6636 * For very tiny x values, we can get away with f(x) = x * ln 2, because
6637 * we don't have the necessary precision to represent 1.0 + z/2! + ...
6638 * and can approximate it as 1.0. For a visual demonstration of this,
6639 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
6640 * as it remains valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
6641 *
6642 *
6643 * As far as constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
6644 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
6645 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
6646 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
6647 * blocks). (The one bit difference is probably an implicit one missing from
6648 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
6649 * Oberman states that it internally used a 68-bit mantissa with an 18-bit
6650 * exponent.
6651 *
6652 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have
6653 * not yet successfully reproduced the exact results from an Intel 10980XE;
6654 * there is always a portion of rounding differences. Not going to spend too
6655 * much time on getting this 100% the same, at least not now.
6656 *
6657 * P.S. If someone is really curious about the 8087 and its constants:
6658 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
6659 *
6660 *
6661 * @param pr80Val The exponent value (x), less than 1.0, greater than
6662 * -1.0 and not zero. This can be a normal, denormal
6663 * or pseudo-denormal value.
6664 * @param pr80Result Where to return the result.
6665 * @param fFcw FPU control word.
6666 * @param fFsw FPU status word.
6667 */
6668static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6669{
6670 /* As mentioned above, we can skip the expensive polynomial calculation
6671 as it will be close enough to 1.0 that it makes no difference.
6672
6673 The cutoff point for the Intel 10980XE is exponents >= -69. Intel
6674 also seems to be using a 67-bit or 68-bit constant value, and we get
6675 a smattering of rounding differences if we go for higher precision. */
6676 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
6677 {
6678 RTUINT256U u256;
6679 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
6680 u256.QWords.qw0 |= 1; /* force #PE */
6681 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
6682 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
6683 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
6684 : 1 - RTFLOAT80U_EXP_BIAS,
6685 fFcw, fFsw);
6686 }
6687 else
6688 {
6689#ifdef IEM_WITH_FLOAT128_FOR_FPU
6690 /* This approach is not good enough for small values; we end up with zero. */
6691 int const fOldRounding = iemFpuF128SetRounding(fFcw);
6692 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
6693 _Float128 rd128Result = powf128(2.0L, rd128Val);
6694 rd128Result -= 1.0L;
6695 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
6696 iemFpuF128RestoreRounding(fOldRounding);
6697
6698# else
6699 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6700 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
6701
6702 /* As mentioned above, enforce 68-bit internal mantissa width to better
6703 match the Intel 10980XE results. */
6704 unsigned const cPrecision = 68;
6705
6706 /* first calculate z = x * ln2 */
6707 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
6708 cPrecision);
6709
6710 /* Then do the polynomial evaluation. */
6711 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
6712 cPrecision, &SoftState);
6713 r = f128_mul(z, r, &SoftState);
6714
6715 /* Output the result. */
6716 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
6717# endif
6718 }
6719 return fFsw;
6720}
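/*
 * For reference, the Horner evaluation above in plain C doubles (illustration
 * only, assuming M_LN2 and tgamma from <math.h>; the real code uses 128-bit
 * SoftFloat values and the g_ar128F2xm1HornerConsts table):
 * @code{.c}
 *  #include <math.h>
 *  static double f2xm1Ref(double x)
 *  {
 *      double const z = x * M_LN2;    // z = x * ln2
 *      double r = 0.0;
 *      for (int n = 20; n >= 2; n--)  // 1/n! constants, innermost term first
 *          r = (r + 1.0 / tgamma((double)n + 1.0)) * z;
 *      return z * (1.0 + r);          // e^z - 1 = z * (1 + z/2! + z^2/3! + ...)
 *  }
 * @endcode
 */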
6721
6722
6723IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6724{
6725 uint16_t const fFcw = pFpuState->FCW;
6726 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6727
6728 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6729 {
6730 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
6731 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6732 else
6733 {
6734 /* Special case:
6735 2^+1.0 - 1.0 = 1.0
6736 2^-1.0 - 1.0 = -0.5 */
6737 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
6738 && pr80Val->s.uMantissa == RT_BIT_64(63))
6739 {
6740 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
6741 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
6742 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6743 }
6744 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
6745 /** @todo The 287 is documented to accept only values 0 <= ST(0) <= 0.5. */
6746 else
6747 pFpuRes->r80Result = *pr80Val;
6748 fFsw |= X86_FSW_PE;
6749 if (!(fFcw & X86_FCW_PM))
6750 fFsw |= X86_FSW_ES | X86_FSW_B;
6751 }
6752 }
6753 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6754 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6755 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6756 pFpuRes->r80Result = *pr80Val;
6757 else if (RTFLOAT80U_IS_INF(pr80Val))
6758 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
6759 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6760 {
6761 fFsw |= X86_FSW_DE;
6762 if (fFcw & X86_FCW_DM)
6763 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6764 else
6765 {
6766 pFpuRes->r80Result = *pr80Val;
6767 fFsw |= X86_FSW_ES | X86_FSW_B;
6768 }
6769 }
6770 else
6771 {
6772 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6773 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6774 && (fFcw & X86_FCW_IM))
6775 pFpuRes->r80Result = g_r80Indefinite;
6776 else
6777 {
6778 pFpuRes->r80Result = *pr80Val;
6779 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6780 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6781 }
6782 fFsw |= X86_FSW_IE;
6783 if (!(fFcw & X86_FCW_IM))
6784 fFsw |= X86_FSW_ES | X86_FSW_B;
6785 }
6786 pFpuRes->FSW = fFsw;
6787}
6788
6789#endif /* IEM_WITHOUT_ASSEMBLY */
6790
6791IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6792{
6793 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
6794}
6795
6796IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6797{
6798 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
6799}
6800
6801#ifdef IEM_WITHOUT_ASSEMBLY
6802
6803IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6804{
6805 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6806 pFpuRes->r80Result = *pr80Val;
6807 pFpuRes->r80Result.s.fSign = 0;
6808}
6809
6810
6811IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6812{
6813 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6814 pFpuRes->r80Result = *pr80Val;
6815 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
6816}
6817
6818
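/**
 * FXTRACT - splits ST(0) into exponent and significand.
 *
 * For finite non-zero inputs, r80Result1 receives the unbiased exponent as an
 * 80-bit float and r80Result2 the significand scaled into [1.0, 2.0)
 * (exponent field forced to RTFLOAT80U_EXP_BIAS), so that the input equals
 * r80Result2 * 2^r80Result1.
 */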
6819IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6820{
6821 uint16_t const fFcw = pFpuState->FCW;
6822 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6823
6824 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6825 {
6826 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
6827 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
6828
6829 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
6830 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
6831 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
6832 }
6833 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6834 {
6835 fFsw |= X86_FSW_ZE;
6836 if (fFcw & X86_FCW_ZM)
6837 {
6838 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
6839 pFpuResTwo->r80Result2 = *pr80Val;
6840 }
6841 else
6842 {
6843 pFpuResTwo->r80Result2 = *pr80Val;
6844 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6845 }
6846 }
6847 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6848 {
6849 fFsw |= X86_FSW_DE;
6850 if (fFcw & X86_FCW_DM)
6851 {
6852 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
6853 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
6854 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
6855 int32_t iExponent = -16382;
6856 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
6857 {
6858 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
6859 iExponent--;
6860 }
6861
6862 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
6863 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
6864 }
6865 else
6866 {
6867 pFpuResTwo->r80Result2 = *pr80Val;
6868 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6869 }
6870 }
6871 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6872 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6873 {
6874 pFpuResTwo->r80Result1 = *pr80Val;
6875 pFpuResTwo->r80Result2 = *pr80Val;
6876 }
6877 else if (RTFLOAT80U_IS_INF(pr80Val))
6878 {
6879 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
6880 pFpuResTwo->r80Result2 = *pr80Val;
6881 }
6882 else
6883 {
6884 if (fFcw & X86_FCW_IM)
6885 {
6886 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6887 pFpuResTwo->r80Result1 = g_r80Indefinite;
6888 else
6889 {
6890 pFpuResTwo->r80Result1 = *pr80Val;
6891 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6892 }
6893 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
6894 }
6895 else
6896 {
6897 pFpuResTwo->r80Result2 = *pr80Val;
6898 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6899 }
6900 fFsw |= X86_FSW_IE;
6901 }
6902 pFpuResTwo->FSW = fFsw;
6903}
6904
6905
6906IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6907 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6908{
6909 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6910 AssertReleaseFailed();
6911}
6912
6913#endif /* IEM_WITHOUT_ASSEMBLY */
6914
6915IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6916 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6917{
6918 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6919}
6920
6921IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6922 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6923{
6924 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6925}
6926
6927#if defined(IEM_WITHOUT_ASSEMBLY)
6928
6929IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6930 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6931{
6932 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6933 AssertReleaseFailed();
6934}
6935
6936#endif /* IEM_WITHOUT_ASSEMBLY */
6937
6938IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6939 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6940{
6941 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6942}
6943
6944IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6945 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6946{
6947 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6948}
6949
6950
6951/*********************************************************************************************************************************
6952* MMX, SSE & AVX *
6953*********************************************************************************************************************************/
6954
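/*
 * MOVSLDUP duplicates the even (low) single-precision elements:
 * { s0, s1, s2, s3 } -> { s0, s0, s2, s2 }. MOVSHDUP further down does the
 * same for the odd ones, and MOVDDUP for the low double-precision element.
 */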
6955IEM_DECL_IMPL_DEF(void, iemAImpl_movsldup,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
6956{
6957 RT_NOREF(pFpuState);
6958 puDst->au32[0] = puSrc->au32[0];
6959 puDst->au32[1] = puSrc->au32[0];
6960 puDst->au32[2] = puSrc->au32[2];
6961 puDst->au32[3] = puSrc->au32[2];
6962}
6963
6964#ifdef IEM_WITH_VEX
6965
6966IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
6967{
6968 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
6969 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
6970 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
6971 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
6972 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6973 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6974 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6975 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6976}
6977
6978
6979IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
6980{
6981 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
6982 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
6983 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
6984 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
6985 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
6986 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
6987 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
6988 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
6989}
6990
6991#endif /* IEM_WITH_VEX */
6992
6993
6994IEM_DECL_IMPL_DEF(void, iemAImpl_movshdup,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
6995{
6996 RT_NOREF(pFpuState);
6997 puDst->au32[0] = puSrc->au32[1];
6998 puDst->au32[1] = puSrc->au32[1];
6999 puDst->au32[2] = puSrc->au32[3];
7000 puDst->au32[3] = puSrc->au32[3];
7001}
7002
7003
7004IEM_DECL_IMPL_DEF(void, iemAImpl_movddup,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, uint64_t uSrc))
7005{
7006 RT_NOREF(pFpuState);
7007 puDst->au64[0] = uSrc;
7008 puDst->au64[1] = uSrc;
7009}
7010
7011#ifdef IEM_WITH_VEX
7012
7013IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7014{
7015 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7016 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7017 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7018 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7019}
7020
7021IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7022{
7023 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7024 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7025 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7026 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7027}
7028
7029#endif /* IEM_WITH_VEX */
7030
7031
7032/*
7033 * PAND / VPAND / ANDPS / VANDPS / ANDPD / VANDPD
7034 */
7035#ifdef IEM_WITHOUT_ASSEMBLY
7036
7037IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7038{
7039 RT_NOREF(pFpuState);
7040 *puDst &= *puSrc;
7041}
7042
7043
7044IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7045{
7046 RT_NOREF(pFpuState);
7047 puDst->au64[0] &= puSrc->au64[0];
7048 puDst->au64[1] &= puSrc->au64[1];
7049}
7050
7051#endif /* IEM_WITHOUT_ASSEMBLY */
7052
7053IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7054 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7055{
7056 RT_NOREF(pExtState);
7057 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7058 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7059}
7060
7061
7062IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7063 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7064{
7065 RT_NOREF(pExtState);
7066 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7067 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7068 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7069 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7070}
7071
7072
7073/*
7074 * PANDN / VPANDN / ANDNPS / VANDNPS / ANDNPD / VANDNPD
7075 */
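/*
 * Note! Unlike the other packed logicals, PANDN inverts the /destination/
 * operand: dst = ~dst & src. Minimal sketch of the 64-bit form (illustration
 * only):
 * @code{.c}
 *  uint64_t uDst = UINT64_C(0x00ff00ff00ff00ff); // mask, gets inverted
 *  uint64_t uSrc = UINT64_C(0x0123456789abcdef);
 *  uDst = ~uDst & uSrc;                          // 0x010045008900cd00
 * @endcode
 */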
7076#ifdef IEM_WITHOUT_ASSEMBLY
7077
7078IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7079{
7080 RT_NOREF(pFpuState);
7081 *puDst = ~*puDst & *puSrc;
7082}
7083
7084
7085IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7086{
7087 RT_NOREF(pFpuState);
7088 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7089 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7090}
7091
7092#endif /* IEM_WITHOUT_ASSEMBLY */
7093
7094IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7095 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7096{
7097 RT_NOREF(pExtState);
7098 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7099 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7100}
7101
7102
7103IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7104 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7105{
7106 RT_NOREF(pExtState);
7107 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7108 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7109 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7110 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7111}
7112
7113
7114/*
7115 * POR / VPOR / ORPS / VORPS / ORPD / VORPD
7116 */
7117#ifdef IEM_WITHOUT_ASSEMBLY
7118
7119IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7120{
7121 RT_NOREF(pFpuState);
7122 *puDst |= *puSrc;
7123}
7124
7125
7126IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7127{
7128 RT_NOREF(pFpuState);
7129 puDst->au64[0] |= puSrc->au64[0];
7130 puDst->au64[1] |= puSrc->au64[1];
7131}
7132
7133#endif /* IEM_WITHOUT_ASSEMBLY */
7134
7135IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7136 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7137{
7138 RT_NOREF(pExtState);
7139 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7140 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7141}
7142
7143
7144IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7145 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7146{
7147 RT_NOREF(pExtState);
7148 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7149 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7150 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7151 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7152}
7153
7154
7155/*
7156 * PXOR / VPXOR / XORPS / VXORPS / XORPD / VXORPD
7157 */
7158#ifdef IEM_WITHOUT_ASSEMBLY
7159
7160IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7161{
7162 RT_NOREF(pFpuState);
7163 *puDst ^= *puSrc;
7164}
7165
7166
7167IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7168{
7169 RT_NOREF(pFpuState);
7170 puDst->au64[0] ^= puSrc->au64[0];
7171 puDst->au64[1] ^= puSrc->au64[1];
7172}
7173
7174#endif /* IEM_WITHOUT_ASSEMBLY */
7175
7176IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7177 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7178{
7179 RT_NOREF(pExtState);
7180 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7181 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7182}
7183
7184
7185IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7186 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7187{
7188 RT_NOREF(pExtState);
7189 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7190 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7191 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7192 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7193}
7194
7195
7196/*
7197 * PCMPEQB / VPCMPEQB
7198 */
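/*
 * The packed compares produce per-element masks (all ones when true, all
 * zeros when false) rather than flags. Minimal sketch for a single byte lane
 * (illustration only):
 * @code{.c}
 *  static uint8_t pcmpeqbLane(uint8_t uByte1, uint8_t uByte2)
 *  {
 *      return uByte1 == uByte2 ? UINT8_MAX : 0; // 0xff on match
 *  }
 * @endcode
 */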
7199#ifdef IEM_WITHOUT_ASSEMBLY
7200
7201IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7202{
7203 RT_NOREF(pFpuState);
7204 RTUINT64U uSrc1 = { *puDst };
7205 RTUINT64U uSrc2 = { *puSrc };
7206 RTUINT64U uDst;
7207 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7208 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7209 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7210 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7211 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7212 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7213 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7214 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7215 *puDst = uDst.u;
7216}
7217
7218
7219IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7220{
7221 RT_NOREF(pFpuState);
7222 RTUINT128U uSrc1 = *puDst;
7223 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7224 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7225 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7226 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7227 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7228 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7229 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7230 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7231 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7232 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7233 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7234 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7235 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7236 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7237 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7238 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7239}
7240
7241#endif /* IEM_WITHOUT_ASSEMBLY */
7242
7243IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7244 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7245{
7246 RT_NOREF(pExtState);
7247 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7248 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7249 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7250 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7251 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7252 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7253 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7254 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7255 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7256 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7257 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7258 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7259 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7260 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7261 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7262 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7263}
7264
7265IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7266 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7267{
7268 RT_NOREF(pExtState);
7269 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7270 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7271 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7272 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7273 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7274 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7275 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7276 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7277 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7278 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7279 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7280 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7281 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7282 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7283 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7284 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7285 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7286 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7287 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7288 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7289 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7290 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7291 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7292 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7293 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7294 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7295 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7296 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7297 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7298 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7299 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7300 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7301}
7302
7303
7304/*
7305 * PCMPEQW / VPCMPEQW
7306 */
7307#ifdef IEM_WITHOUT_ASSEMBLY
7308
7309IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7310{
7311 RT_NOREF(pFpuState);
7312 RTUINT64U uSrc1 = { *puDst };
7313 RTUINT64U uSrc2 = { *puSrc };
7314 RTUINT64U uDst;
7315 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7316 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7317 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7318 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7319 *puDst = uDst.u;
7320}
7321
7322
7323IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7324{
7325 RT_NOREF(pFpuState);
7326 RTUINT128U uSrc1 = *puDst;
7327 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7328 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7329 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7330 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7331 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7332 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7333 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7334 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7335}
7336
7337#endif /* IEM_WITHOUT_ASSEMBLY */
7338
7339IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7340 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7341{
7342 RT_NOREF(pExtState);
7343 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7344 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7345 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7346 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7347 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7348 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7349 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7350 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7351}
7352
7353IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7354 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7355{
7356 RT_NOREF(pExtState);
7357 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7358 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7359 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7360 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7361 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7362 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7363 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7364 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7365 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
7366 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
7367 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
7368 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
7369 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
7370 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
7371 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
7372 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
7373}
7374
7375
7376/*
7377 * PCMPEQD / VPCMPEQD.
7378 */
7379#ifdef IEM_WITHOUT_ASSEMBLY
7380
7381IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7382{
7383 RT_NOREF(pFpuState);
7384 RTUINT64U uSrc1 = { *puDst };
7385 RTUINT64U uSrc2 = { *puSrc };
7386 RTUINT64U uDst;
7387 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
7388 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
7389 *puDst = uDst.u;
7390}
7391
7392
7393IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7394{
7395 RT_NOREF(pFpuState);
7396 RTUINT128U uSrc1 = *puDst;
7397 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
7398 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
7399 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
7400 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
7401}
7402
7403#endif /* IEM_WITHOUT_ASSEMBLY */
7404
7405IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7406 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7407{
7408 RT_NOREF(pExtState);
7409 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7410 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7411 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7412 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7413}
7414
7415IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7416 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7417{
7418 RT_NOREF(pExtState);
7419 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7420 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7421 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7422 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7423 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
7424 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
7425 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
7426 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
7427}
7428
7429
7430/*
7431 * PCMPEQQ / VPCMPEQQ.
7432 */
7433IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7434{
7435 RT_NOREF(pFpuState);
7436 RTUINT128U uSrc1 = *puDst;
7437 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
7438 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
7439}
7440
7441IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7442 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7443{
7444 RT_NOREF(pExtState);
7445 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7446 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7447}
7448
7449IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7450 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7451{
7452 RT_NOREF(pExtState);
7453 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7454 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7455 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
7456 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
7457}
7458
7459
7460/*
7461 * PCMPGTB / VPCMPGTB
7462 */
7463#ifdef IEM_WITHOUT_ASSEMBLY
7464
7465IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7466{
7467 RT_NOREF(pFpuState);
7468 RTUINT64U uSrc1 = { *puDst };
7469 RTUINT64U uSrc2 = { *puSrc };
7470 RTUINT64U uDst;
7471 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
7472 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
7473 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
7474 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
7475 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
7476 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
7477 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
7478 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
7479 *puDst = uDst.u;
7480}
7481
7482
7483IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7484{
7485 RT_NOREF(pFpuState);
7486 RTUINT128U uSrc1 = *puDst;
7487 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
7488 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
7489 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
7490 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
7491 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
7492 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
7493 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
7494 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
7495 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
7496 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
7497 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
7498 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
7499 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
7500 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
7501 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
7502 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
7503}
7504
7505#endif /* IEM_WITHOUT_ASSEMBLY */
7506
7507IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7508 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7509{
7510 RT_NOREF(pExtState);
7511 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7512 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7513 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7514 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7515 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7516 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7517 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7518 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7519 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7520 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7521 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7522 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7523 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7524 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7525 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7526 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7527}
7528
7529IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7530 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7531{
7532 RT_NOREF(pExtState);
7533 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7534 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7535 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7536 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7537 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7538 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7539 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7540 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7541 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7542 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7543 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7544 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7545 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7546 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7547 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7548 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7549 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
7550 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
7551 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
7552 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
7553 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
7554 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
7555 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
7556 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
7557 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
7558 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
7559 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
7560 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
7561 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
7562 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
7563 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
7564 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
7565}
7566
7567
7568/*
7569 * PCMPGTW / VPCMPGTW
7570 */
7571#ifdef IEM_WITHOUT_ASSEMBLY
7572
7573IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7574{
7575 RT_NOREF(pFpuState);
7576 RTUINT64U uSrc1 = { *puDst };
7577 RTUINT64U uSrc2 = { *puSrc };
7578 RTUINT64U uDst;
7579 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
7580 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
7581 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
7582 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
7583 *puDst = uDst.u;
7584}
7585
7586
7587IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7588{
7589 RT_NOREF(pFpuState);
7590 RTUINT128U uSrc1 = *puDst;
7591 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
7592 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
7593 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
7594 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
7595 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
7596 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
7597 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
7598 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
7599}
7600
7601#endif /* IEM_WITHOUT_ASSEMBLY */

IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
                                                         PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
    puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
    puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
    puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
    puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
    puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
    puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
    puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
}

IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
                                                         PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
    puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
    puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
    puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
    puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
    puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
    puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
    puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
    puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
    puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
    puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
    puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
    puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
    puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
    puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
    puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
}


/*
 * PCMPGTD / VPCMPGTD.
 */
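/* Same mask-producing compare as above, but on signed 32-bit elements.
   Note that MMX/SSE only provide signed greater-than compares; there are
   no unsigned PCMPGT variants. */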
#ifdef IEM_WITHOUT_ASSEMBLY

IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
{
    RT_NOREF(pFpuState);
    RTUINT64U uSrc1 = { *puDst };
    RTUINT64U uSrc2 = { *puSrc };
    RTUINT64U uDst;
    uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
    uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
    *puDst = uDst.u;
}


IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
{
    RT_NOREF(pFpuState);
    RTUINT128U uSrc1 = *puDst;
    puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
    puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
    puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
    puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
}

#endif /* IEM_WITHOUT_ASSEMBLY */

IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
                                                         PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
    puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
    puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
    puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
}

IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
                                                         PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
    puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
    puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
    puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
    puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
    puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
    puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
    puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
}


/*
 * PCMPGTQ / VPCMPGTQ.
 */
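/* PCMPGTQ is an SSE4.2 instruction, so there is no MMX (u64) form and only
   C fallbacks are provided here. */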
IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
{
    RT_NOREF(pFpuState);
    RTUINT128U uSrc1 = *puDst;
    puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
    puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
}

IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
                                                         PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
    puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
}

IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
                                                         PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
    puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
    puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
    puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
}


/*
 * PADDB / VPADDB
 */
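/* PADD* does wrap-around (non-saturating) addition; the unsigned C
   arithmetic below truncates to the element width, which matches the
   instruction semantics for all element sizes. */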
#ifdef IEM_WITHOUT_ASSEMBLY

IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
{
    RT_NOREF(pFpuState);
    RTUINT64U uSrc1 = { *puDst };
    RTUINT64U uSrc2 = { *puSrc };
    RTUINT64U uDst;
    uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
    uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
    uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
    uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
    uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
    uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
    uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
    uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
    *puDst = uDst.u;
}


IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
{
    RT_NOREF(pFpuState);
    RTUINT128U uSrc1 = *puDst;
    puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
    puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
    puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
    puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
    puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
    puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
    puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
    puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
    puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
    puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
    puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
    puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
    puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
    puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
    puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
    puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
}

#endif /* IEM_WITHOUT_ASSEMBLY */

IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
                                                       PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
    puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
    puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
    puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
    puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
    puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
    puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
    puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
    puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
    puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
    puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
    puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
    puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
    puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
    puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
    puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
}

IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
                                                       PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
    puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
    puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
    puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
    puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
    puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
    puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
    puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
    puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
    puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
    puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
    puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
    puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
    puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
    puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
    puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
    puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
    puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
    puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
    puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
    puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
    puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
    puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
    puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
    puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
    puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
    puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
    puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
    puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
    puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
    puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
    puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
}


/*
 * PADDW / VPADDW
 */
#ifdef IEM_WITHOUT_ASSEMBLY

IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
{
    RT_NOREF(pFpuState);
    RTUINT64U uSrc1 = { *puDst };
    RTUINT64U uSrc2 = { *puSrc };
    RTUINT64U uDst;
    uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
    uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
    uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
    uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
    *puDst = uDst.u;
}


IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
{
    RT_NOREF(pFpuState);
    RTUINT128U uSrc1 = *puDst;
    puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
    puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
    puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
    puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
    puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
    puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
    puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
    puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
}

#endif /* IEM_WITHOUT_ASSEMBLY */

IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
                                                       PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
    puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
    puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
    puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
    puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
    puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
    puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
    puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
}

IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
                                                       PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
    puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
    puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
    puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
    puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
    puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
    puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
    puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
    puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
    puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
    puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
    puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
    puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
    puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
    puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
    puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
}


/*
 * PADDD / VPADDD.
 */
#ifdef IEM_WITHOUT_ASSEMBLY

IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
{
    RT_NOREF(pFpuState);
    RTUINT64U uSrc1 = { *puDst };
    RTUINT64U uSrc2 = { *puSrc };
    RTUINT64U uDst;
    uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
    uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
    *puDst = uDst.u;
}


IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
{
    RT_NOREF(pFpuState);
    RTUINT128U uSrc1 = *puDst;
    puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
    puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
    puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
    puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
}

#endif /* IEM_WITHOUT_ASSEMBLY */

IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
                                                       PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
    puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
    puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
    puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
}

IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
                                                       PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
    puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
    puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
    puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
    puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
    puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
    puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
    puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
}


/*
 * PADDQ / VPADDQ.
 */
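/* The MMX form operates on a single 64-bit lane, so one plain addition
   covers the whole operand. */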
#ifdef IEM_WITHOUT_ASSEMBLY

IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
{
    RT_NOREF(pFpuState);
    *puDst = *puDst + *puSrc;
}

IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
{
    RT_NOREF(pFpuState);
    RTUINT128U uSrc1 = *puDst;
    puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
    puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
}

#endif /* IEM_WITHOUT_ASSEMBLY */

IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
                                                       PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
    puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
}

IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
                                                       PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
{
    RT_NOREF(pExtState);
    puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
    puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
    puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
    puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
}


/*
 * PMOVMSKB / PSHUF* / PUNPCKH* / PUNPCKL*.
 */
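/* These C stubs merely abort via AssertReleaseFailed(); they are compiled
   only when IEM_WITHOUT_ASSEMBLY is defined and still await a portable
   implementation. */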

#ifdef IEM_WITHOUT_ASSEMBLY

IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src))
{
    RT_NOREF(pFpuState, pu64Dst, pu64Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, PCRTUINT128U pu128Src))
{
    RT_NOREF(pFpuState, pu64Dst, pu128Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src, uint8_t bEvil))
{
    RT_NOREF(pFpuState, pu64Dst, pu64Src, bEvil);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src, uint8_t bEvil))
{
    RT_NOREF(pFpuState, pu128Dst, pu128Src, bEvil);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src, uint8_t bEvil))
{
    RT_NOREF(pFpuState, pu128Dst, pu128Src, bEvil);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src, uint8_t bEvil))
{
    RT_NOREF(pFpuState, pu128Dst, pu128Src, bEvil);
    AssertReleaseFailed();
}

/* PUNPCKHxxx */

IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src))
{
    RT_NOREF(pFpuState, pu64Dst, pu64Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src))
{
    RT_NOREF(pFpuState, pu128Dst, pu128Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src))
{
    RT_NOREF(pFpuState, pu64Dst, pu64Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src))
{
    RT_NOREF(pFpuState, pu128Dst, pu128Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src))
{
    RT_NOREF(pFpuState, pu64Dst, pu64Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src))
{
    RT_NOREF(pFpuState, pu128Dst, pu128Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src))
{
    RT_NOREF(pFpuState, pu128Dst, pu128Src);
    AssertReleaseFailed();
}

/* PUNPCKLxxx */

IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint32_t const *pu32Src))
{
    RT_NOREF(pFpuState, pu64Dst, pu32Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, uint64_t const *pu64Src))
{
    RT_NOREF(pFpuState, pu128Dst, pu64Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint32_t const *pu32Src))
{
    RT_NOREF(pFpuState, pu64Dst, pu32Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, uint64_t const *pu64Src))
{
    RT_NOREF(pFpuState, pu128Dst, pu64Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint32_t const *pu32Src))
{
    RT_NOREF(pFpuState, pu64Dst, pu32Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, uint64_t const *pu64Src))
{
    RT_NOREF(pFpuState, pu128Dst, pu64Src);
    AssertReleaseFailed();
}


IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, uint64_t const *pu64Src))
{
    RT_NOREF(pFpuState, pu128Dst, pu64Src);
    AssertReleaseFailed();
}

#endif /* IEM_WITHOUT_ASSEMBLY */


/*
 * CRC32 (SSE 4.2).
 */
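/* The CRC32 instruction uses the CRC-32C (Castagnoli) polynomial 0x1EDC6F41
   rather than the zlib/PKZIP CRC-32 one, hence the use of IPRT's
   RTCrc32CProcess to fold the source operand into the destination
   accumulator. */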

IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
{
    *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
}


IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
{
    *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
}


IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
{
    *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
}


IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
{
    *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
}