VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 95562

Last change on this file since 95562 was 95543, checked in by vboxsync, 3 years ago

VMM/IEM: vmovshdup. Removed unused parameter from iemAImpl_movshdup and iemAImpl_movddup. bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 367.2 KB
Line 
1/* $Id: IEMAllAImplC.cpp 95543 2022-07-06 21:02:13Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2022 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18
19/*********************************************************************************************************************************
20* Header Files *
21*********************************************************************************************************************************/
22#include "IEMInternal.h"
23#include <VBox/vmm/vmcc.h>
24#include <iprt/errcore.h>
25#include <iprt/x86.h>
26#include <iprt/uint128.h>
27#include <iprt/uint256.h>
28#include <iprt/crc.h>
29
30RT_C_DECLS_BEGIN
31#include <softfloat.h>
32RT_C_DECLS_END
33
34
35/*********************************************************************************************************************************
36* Defined Constants And Macros *
37*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Forced on for ARM targets and doxygen runs — presumably no assembly
 * variants of these helpers exist for ARM (NOTE(review): confirm against
 * iemAllAImpl.asm build coverage).
 */
#if !defined(IEM_WITHOUT_ASSEMBLY)
# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
#  define IEM_WITHOUT_ASSEMBLY
# endif
#endif
/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
#ifdef IEM_WITH_ASSEMBLY
# undef IEM_WITHOUT_ASSEMBLY
#endif
50
/**
 * Calculates the signed flag value given a result and its bit width.
 *
 * The signed flag (SF) is a duplication of the most significant bit in the
 * result: the MSB is shifted right so it lands exactly on the SF bit
 * position in EFLAGS.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) & X86_EFL_SF )
63
/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) indicates whether the result is zero or not; the boolean
 * comparison result (0 or 1) is shifted into the ZF bit position.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)((a_uResult) == 0) << X86_EFL_ZF_BIT )
74
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * These are typically used by concatenating with a bit count. The problem is
 * that 8-bit values need shifting in the other direction than the others:
 * bit 7 sits below the OF bit position (X86_EFL_OF_BIT), so it is shifted
 * left, whereas the wider widths shift their sign bit right.
 */
#define X86_EFL_GET_OF_8(a_uValue)  (((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_16(a_uValue) ((uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_32(a_uValue) ((uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_64(a_uValue) ((uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
85
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after arithmetic op.
 *
 * Statement macro (do/while(0)); the updated flags are stored back through
 * a_pfEFlags rather than returned.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation
 *                          (SUB passes uSrc with the sign bit flipped).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        /* AF: carry out of bit 3, recovered by xoring result with both inputs. */ \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* Overflow during ADDition happens when both inputs have the same signed \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
           follows that for SUBtraction the signed bit value must differ between \
           the two inputs and the result's signed bit diff from the first input. \
           Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth( (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                     & RT_BIT_64(a_cBitsWidth - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
122
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand
 * is undefined.  We do not set AF, as that seems to make the most sense
 * (which probably makes it the most wrong in real life).
 *
 * Statement macro (do/while(0)); the updated flags are stored back through
 * a_pfEFlags rather than returned.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
146
147
148/*********************************************************************************************************************************
149* Global Variables *
150*********************************************************************************************************************************/
151/**
152 * Parity calculation table.
153 *
154 * This is also used by iemAllAImpl.asm.
155 *
156 * The generator code:
157 * @code
158 * #include <stdio.h>
159 *
160 * int main()
161 * {
162 * unsigned b;
163 * for (b = 0; b < 256; b++)
164 * {
165 * int cOnes = ( b & 1)
166 * + ((b >> 1) & 1)
167 * + ((b >> 2) & 1)
168 * + ((b >> 3) & 1)
169 * + ((b >> 4) & 1)
170 * + ((b >> 5) & 1)
171 * + ((b >> 6) & 1)
172 * + ((b >> 7) & 1);
173 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
174 * b,
175 * (b >> 7) & 1,
176 * (b >> 6) & 1,
177 * (b >> 5) & 1,
178 * (b >> 4) & 1,
179 * (b >> 3) & 1,
180 * (b >> 2) & 1,
181 * (b >> 1) & 1,
182 * b & 1,
183 * cOnes & 1 ? "0" : "X86_EFL_PF");
184 * }
185 * return 0;
186 * }
187 * @endcode
188 */
/* Parity lookup, indexed by the low byte of a result; entry is X86_EFL_PF
   when the byte has an even number of set bits, 0 otherwise.  Generated by
   the program in the comment above — do not edit entries by hand. */
uint8_t const g_afParity[256] =
{
    /* 0x00 = 00000000b */ X86_EFL_PF,
    /* 0x01 = 00000001b */ 0,
    /* 0x02 = 00000010b */ 0,
    /* 0x03 = 00000011b */ X86_EFL_PF,
    /* 0x04 = 00000100b */ 0,
    /* 0x05 = 00000101b */ X86_EFL_PF,
    /* 0x06 = 00000110b */ X86_EFL_PF,
    /* 0x07 = 00000111b */ 0,
    /* 0x08 = 00001000b */ 0,
    /* 0x09 = 00001001b */ X86_EFL_PF,
    /* 0x0a = 00001010b */ X86_EFL_PF,
    /* 0x0b = 00001011b */ 0,
    /* 0x0c = 00001100b */ X86_EFL_PF,
    /* 0x0d = 00001101b */ 0,
    /* 0x0e = 00001110b */ 0,
    /* 0x0f = 00001111b */ X86_EFL_PF,
    /* 0x10 = 00010000b */ 0,
    /* 0x11 = 00010001b */ X86_EFL_PF,
    /* 0x12 = 00010010b */ X86_EFL_PF,
    /* 0x13 = 00010011b */ 0,
    /* 0x14 = 00010100b */ X86_EFL_PF,
    /* 0x15 = 00010101b */ 0,
    /* 0x16 = 00010110b */ 0,
    /* 0x17 = 00010111b */ X86_EFL_PF,
    /* 0x18 = 00011000b */ X86_EFL_PF,
    /* 0x19 = 00011001b */ 0,
    /* 0x1a = 00011010b */ 0,
    /* 0x1b = 00011011b */ X86_EFL_PF,
    /* 0x1c = 00011100b */ 0,
    /* 0x1d = 00011101b */ X86_EFL_PF,
    /* 0x1e = 00011110b */ X86_EFL_PF,
    /* 0x1f = 00011111b */ 0,
    /* 0x20 = 00100000b */ 0,
    /* 0x21 = 00100001b */ X86_EFL_PF,
    /* 0x22 = 00100010b */ X86_EFL_PF,
    /* 0x23 = 00100011b */ 0,
    /* 0x24 = 00100100b */ X86_EFL_PF,
    /* 0x25 = 00100101b */ 0,
    /* 0x26 = 00100110b */ 0,
    /* 0x27 = 00100111b */ X86_EFL_PF,
    /* 0x28 = 00101000b */ X86_EFL_PF,
    /* 0x29 = 00101001b */ 0,
    /* 0x2a = 00101010b */ 0,
    /* 0x2b = 00101011b */ X86_EFL_PF,
    /* 0x2c = 00101100b */ 0,
    /* 0x2d = 00101101b */ X86_EFL_PF,
    /* 0x2e = 00101110b */ X86_EFL_PF,
    /* 0x2f = 00101111b */ 0,
    /* 0x30 = 00110000b */ X86_EFL_PF,
    /* 0x31 = 00110001b */ 0,
    /* 0x32 = 00110010b */ 0,
    /* 0x33 = 00110011b */ X86_EFL_PF,
    /* 0x34 = 00110100b */ 0,
    /* 0x35 = 00110101b */ X86_EFL_PF,
    /* 0x36 = 00110110b */ X86_EFL_PF,
    /* 0x37 = 00110111b */ 0,
    /* 0x38 = 00111000b */ 0,
    /* 0x39 = 00111001b */ X86_EFL_PF,
    /* 0x3a = 00111010b */ X86_EFL_PF,
    /* 0x3b = 00111011b */ 0,
    /* 0x3c = 00111100b */ X86_EFL_PF,
    /* 0x3d = 00111101b */ 0,
    /* 0x3e = 00111110b */ 0,
    /* 0x3f = 00111111b */ X86_EFL_PF,
    /* 0x40 = 01000000b */ 0,
    /* 0x41 = 01000001b */ X86_EFL_PF,
    /* 0x42 = 01000010b */ X86_EFL_PF,
    /* 0x43 = 01000011b */ 0,
    /* 0x44 = 01000100b */ X86_EFL_PF,
    /* 0x45 = 01000101b */ 0,
    /* 0x46 = 01000110b */ 0,
    /* 0x47 = 01000111b */ X86_EFL_PF,
    /* 0x48 = 01001000b */ X86_EFL_PF,
    /* 0x49 = 01001001b */ 0,
    /* 0x4a = 01001010b */ 0,
    /* 0x4b = 01001011b */ X86_EFL_PF,
    /* 0x4c = 01001100b */ 0,
    /* 0x4d = 01001101b */ X86_EFL_PF,
    /* 0x4e = 01001110b */ X86_EFL_PF,
    /* 0x4f = 01001111b */ 0,
    /* 0x50 = 01010000b */ X86_EFL_PF,
    /* 0x51 = 01010001b */ 0,
    /* 0x52 = 01010010b */ 0,
    /* 0x53 = 01010011b */ X86_EFL_PF,
    /* 0x54 = 01010100b */ 0,
    /* 0x55 = 01010101b */ X86_EFL_PF,
    /* 0x56 = 01010110b */ X86_EFL_PF,
    /* 0x57 = 01010111b */ 0,
    /* 0x58 = 01011000b */ 0,
    /* 0x59 = 01011001b */ X86_EFL_PF,
    /* 0x5a = 01011010b */ X86_EFL_PF,
    /* 0x5b = 01011011b */ 0,
    /* 0x5c = 01011100b */ X86_EFL_PF,
    /* 0x5d = 01011101b */ 0,
    /* 0x5e = 01011110b */ 0,
    /* 0x5f = 01011111b */ X86_EFL_PF,
    /* 0x60 = 01100000b */ X86_EFL_PF,
    /* 0x61 = 01100001b */ 0,
    /* 0x62 = 01100010b */ 0,
    /* 0x63 = 01100011b */ X86_EFL_PF,
    /* 0x64 = 01100100b */ 0,
    /* 0x65 = 01100101b */ X86_EFL_PF,
    /* 0x66 = 01100110b */ X86_EFL_PF,
    /* 0x67 = 01100111b */ 0,
    /* 0x68 = 01101000b */ 0,
    /* 0x69 = 01101001b */ X86_EFL_PF,
    /* 0x6a = 01101010b */ X86_EFL_PF,
    /* 0x6b = 01101011b */ 0,
    /* 0x6c = 01101100b */ X86_EFL_PF,
    /* 0x6d = 01101101b */ 0,
    /* 0x6e = 01101110b */ 0,
    /* 0x6f = 01101111b */ X86_EFL_PF,
    /* 0x70 = 01110000b */ 0,
    /* 0x71 = 01110001b */ X86_EFL_PF,
    /* 0x72 = 01110010b */ X86_EFL_PF,
    /* 0x73 = 01110011b */ 0,
    /* 0x74 = 01110100b */ X86_EFL_PF,
    /* 0x75 = 01110101b */ 0,
    /* 0x76 = 01110110b */ 0,
    /* 0x77 = 01110111b */ X86_EFL_PF,
    /* 0x78 = 01111000b */ X86_EFL_PF,
    /* 0x79 = 01111001b */ 0,
    /* 0x7a = 01111010b */ 0,
    /* 0x7b = 01111011b */ X86_EFL_PF,
    /* 0x7c = 01111100b */ 0,
    /* 0x7d = 01111101b */ X86_EFL_PF,
    /* 0x7e = 01111110b */ X86_EFL_PF,
    /* 0x7f = 01111111b */ 0,
    /* 0x80 = 10000000b */ 0,
    /* 0x81 = 10000001b */ X86_EFL_PF,
    /* 0x82 = 10000010b */ X86_EFL_PF,
    /* 0x83 = 10000011b */ 0,
    /* 0x84 = 10000100b */ X86_EFL_PF,
    /* 0x85 = 10000101b */ 0,
    /* 0x86 = 10000110b */ 0,
    /* 0x87 = 10000111b */ X86_EFL_PF,
    /* 0x88 = 10001000b */ X86_EFL_PF,
    /* 0x89 = 10001001b */ 0,
    /* 0x8a = 10001010b */ 0,
    /* 0x8b = 10001011b */ X86_EFL_PF,
    /* 0x8c = 10001100b */ 0,
    /* 0x8d = 10001101b */ X86_EFL_PF,
    /* 0x8e = 10001110b */ X86_EFL_PF,
    /* 0x8f = 10001111b */ 0,
    /* 0x90 = 10010000b */ X86_EFL_PF,
    /* 0x91 = 10010001b */ 0,
    /* 0x92 = 10010010b */ 0,
    /* 0x93 = 10010011b */ X86_EFL_PF,
    /* 0x94 = 10010100b */ 0,
    /* 0x95 = 10010101b */ X86_EFL_PF,
    /* 0x96 = 10010110b */ X86_EFL_PF,
    /* 0x97 = 10010111b */ 0,
    /* 0x98 = 10011000b */ 0,
    /* 0x99 = 10011001b */ X86_EFL_PF,
    /* 0x9a = 10011010b */ X86_EFL_PF,
    /* 0x9b = 10011011b */ 0,
    /* 0x9c = 10011100b */ X86_EFL_PF,
    /* 0x9d = 10011101b */ 0,
    /* 0x9e = 10011110b */ 0,
    /* 0x9f = 10011111b */ X86_EFL_PF,
    /* 0xa0 = 10100000b */ X86_EFL_PF,
    /* 0xa1 = 10100001b */ 0,
    /* 0xa2 = 10100010b */ 0,
    /* 0xa3 = 10100011b */ X86_EFL_PF,
    /* 0xa4 = 10100100b */ 0,
    /* 0xa5 = 10100101b */ X86_EFL_PF,
    /* 0xa6 = 10100110b */ X86_EFL_PF,
    /* 0xa7 = 10100111b */ 0,
    /* 0xa8 = 10101000b */ 0,
    /* 0xa9 = 10101001b */ X86_EFL_PF,
    /* 0xaa = 10101010b */ X86_EFL_PF,
    /* 0xab = 10101011b */ 0,
    /* 0xac = 10101100b */ X86_EFL_PF,
    /* 0xad = 10101101b */ 0,
    /* 0xae = 10101110b */ 0,
    /* 0xaf = 10101111b */ X86_EFL_PF,
    /* 0xb0 = 10110000b */ 0,
    /* 0xb1 = 10110001b */ X86_EFL_PF,
    /* 0xb2 = 10110010b */ X86_EFL_PF,
    /* 0xb3 = 10110011b */ 0,
    /* 0xb4 = 10110100b */ X86_EFL_PF,
    /* 0xb5 = 10110101b */ 0,
    /* 0xb6 = 10110110b */ 0,
    /* 0xb7 = 10110111b */ X86_EFL_PF,
    /* 0xb8 = 10111000b */ X86_EFL_PF,
    /* 0xb9 = 10111001b */ 0,
    /* 0xba = 10111010b */ 0,
    /* 0xbb = 10111011b */ X86_EFL_PF,
    /* 0xbc = 10111100b */ 0,
    /* 0xbd = 10111101b */ X86_EFL_PF,
    /* 0xbe = 10111110b */ X86_EFL_PF,
    /* 0xbf = 10111111b */ 0,
    /* 0xc0 = 11000000b */ X86_EFL_PF,
    /* 0xc1 = 11000001b */ 0,
    /* 0xc2 = 11000010b */ 0,
    /* 0xc3 = 11000011b */ X86_EFL_PF,
    /* 0xc4 = 11000100b */ 0,
    /* 0xc5 = 11000101b */ X86_EFL_PF,
    /* 0xc6 = 11000110b */ X86_EFL_PF,
    /* 0xc7 = 11000111b */ 0,
    /* 0xc8 = 11001000b */ 0,
    /* 0xc9 = 11001001b */ X86_EFL_PF,
    /* 0xca = 11001010b */ X86_EFL_PF,
    /* 0xcb = 11001011b */ 0,
    /* 0xcc = 11001100b */ X86_EFL_PF,
    /* 0xcd = 11001101b */ 0,
    /* 0xce = 11001110b */ 0,
    /* 0xcf = 11001111b */ X86_EFL_PF,
    /* 0xd0 = 11010000b */ 0,
    /* 0xd1 = 11010001b */ X86_EFL_PF,
    /* 0xd2 = 11010010b */ X86_EFL_PF,
    /* 0xd3 = 11010011b */ 0,
    /* 0xd4 = 11010100b */ X86_EFL_PF,
    /* 0xd5 = 11010101b */ 0,
    /* 0xd6 = 11010110b */ 0,
    /* 0xd7 = 11010111b */ X86_EFL_PF,
    /* 0xd8 = 11011000b */ X86_EFL_PF,
    /* 0xd9 = 11011001b */ 0,
    /* 0xda = 11011010b */ 0,
    /* 0xdb = 11011011b */ X86_EFL_PF,
    /* 0xdc = 11011100b */ 0,
    /* 0xdd = 11011101b */ X86_EFL_PF,
    /* 0xde = 11011110b */ X86_EFL_PF,
    /* 0xdf = 11011111b */ 0,
    /* 0xe0 = 11100000b */ 0,
    /* 0xe1 = 11100001b */ X86_EFL_PF,
    /* 0xe2 = 11100010b */ X86_EFL_PF,
    /* 0xe3 = 11100011b */ 0,
    /* 0xe4 = 11100100b */ X86_EFL_PF,
    /* 0xe5 = 11100101b */ 0,
    /* 0xe6 = 11100110b */ 0,
    /* 0xe7 = 11100111b */ X86_EFL_PF,
    /* 0xe8 = 11101000b */ X86_EFL_PF,
    /* 0xe9 = 11101001b */ 0,
    /* 0xea = 11101010b */ 0,
    /* 0xeb = 11101011b */ X86_EFL_PF,
    /* 0xec = 11101100b */ 0,
    /* 0xed = 11101101b */ X86_EFL_PF,
    /* 0xee = 11101110b */ X86_EFL_PF,
    /* 0xef = 11101111b */ 0,
    /* 0xf0 = 11110000b */ X86_EFL_PF,
    /* 0xf1 = 11110001b */ 0,
    /* 0xf2 = 11110010b */ 0,
    /* 0xf3 = 11110011b */ X86_EFL_PF,
    /* 0xf4 = 11110100b */ 0,
    /* 0xf5 = 11110101b */ X86_EFL_PF,
    /* 0xf6 = 11110110b */ X86_EFL_PF,
    /* 0xf7 = 11110111b */ 0,
    /* 0xf8 = 11111000b */ 0,
    /* 0xf9 = 11111001b */ X86_EFL_PF,
    /* 0xfa = 11111010b */ X86_EFL_PF,
    /* 0xfb = 11111011b */ 0,
    /* 0xfc = 11111100b */ X86_EFL_PF,
    /* 0xfd = 11111101b */ 0,
    /* 0xfe = 11111110b */ 0,
    /* 0xff = 11111111b */ X86_EFL_PF,
};
448
/* Forward declarations of the constants defined below; kept for clang's
   benefit (presumably to silence a missing-prior-declaration warning such as
   -Wmissing-variable-declarations — NOTE(review): confirm against the build
   flags). */
extern const RTFLOAT80U g_ar80Zero[];
extern const RTFLOAT80U g_ar80One[];
extern const RTFLOAT80U g_r80Indefinite;
extern const RTFLOAT80U g_ar80Infinity[];
extern const RTFLOAT128U g_r128Ln2;
extern const RTUINT128U g_u128Ln2Mantissa;
extern const RTUINT128U g_u128Ln2MantissaIntel;
extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
458
/** Zero values (indexed by fSign). */
RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };

/** One values (indexed by fSign): mantissa integer bit set, biased exponent 0. */
RTFLOAT80U const g_ar80One[] =
{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };

/** Indefinite (negative). */
RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);

/** Infinities (indexed by fSign). */
RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };

#if 0
/** 128-bit floating point constant: 2.0 */
const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
#endif
476
477
/* The next section is generated by tools/IEMGenFpuConstants: */

/** The ln2 constant as 128-bit floating point value.
 * Note: the active initializer truncates the mantissa after 0xf3579 (the
 * full-precision variant is the commented-out line below).
 * base-10: 6.93147180559945309417232121458176575e-1
 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
/** High precision ln2 value.
 * base-10: 6.931471805599453094172321214581765680747e-1
 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
 * base-10: 6.931471805599453094151379470289064954613e-1
 * base-16: b.17217f7d1cf79abc0000000000000000@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
496
/** Horner constants for f2xm1.
 * Polynomial coefficients evaluated via Horner's scheme; generated by
 * tools/IEMGenFpuConstants (see note above) — do not edit by hand. */
const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
{
    /* a0
     * base-10: 1.00000000000000000000000000000000000e0
     * base-16: 1.0000000000000000000000000000@0
     * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
    RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
    /* a1
     * base-10: 5.00000000000000000000000000000000000e-1
     * base-16: 8.0000000000000000000000000000@-1
     * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
    RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
    /* a2
     * base-10: 1.66666666666666666666666666666666658e-1
     * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
     * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
    RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
    /* a3
     * base-10: 4.16666666666666666666666666666666646e-2
     * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
     * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
    RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
    /* a4
     * base-10: 8.33333333333333333333333333333333323e-3
     * base-16: 2.2222222222222222222222222222@-2
     * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
    RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
    /* a5
     * base-10: 1.38888888888888888888888888888888874e-3
     * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
     * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
    RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
    /* a6
     * base-10: 1.98412698412698412698412698412698412e-4
     * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
     * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
    RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
    /* a7
     * base-10: 2.48015873015873015873015873015873015e-5
     * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
     * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
    RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
    /* a8
     * base-10: 2.75573192239858906525573192239858902e-6
     * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
     * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
    RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
    /* a9
     * base-10: 2.75573192239858906525573192239858865e-7
     * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
     * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
    RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
    /* a10
     * base-10: 2.50521083854417187750521083854417184e-8
     * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
     * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
    RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
    /* a11
     * base-10: 2.08767569878680989792100903212014296e-9
     * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
     * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
    RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
    /* a12
     * base-10: 1.60590438368216145993923771701549472e-10
     * base-16: b.092309d43684be51c198e91d7b40@-9
     * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
    RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
    /* a13
     * base-10: 1.14707455977297247138516979786821043e-11
     * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
     * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
    RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
    /* a14
     * base-10: 7.64716373181981647590113198578806964e-13
     * base-16: d.73f9f399dc0f88ec32b587746578@-11
     * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
    RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
    /* a15
     * base-10: 4.77947733238738529743820749111754352e-14
     * base-16: d.73f9f399dc0f88ec32b587746578@-12
     * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
    RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
    /* a16
     * base-10: 2.81145725434552076319894558301031970e-15
     * base-16: c.a963b81856a53593028cbbb8d7f8@-13
     * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
    RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
    /* a17
     * base-10: 1.56192069685862264622163643500573321e-16
     * base-16: b.413c31dcbecbbdd8024435161550@-14
     * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
    RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
    /* a18
     * base-10: 8.22063524662432971695598123687227980e-18
     * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
     * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
    RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
    /* a19
     * base-10: 4.11031762331216485847799061843614006e-19
     * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
     * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
    RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
    /* a20
     * base-10: 7.04351638180413298434020229233492164e-20
     * base-16: 1.4c9ee35db1d1f3c946fdcd48fd88@-16
     * base-2 : 1.0100110010011110111000110101110110110001110100011111001111001001010001101111110111001101010010001111110110001000e-64 */
    RTFLOAT128U_INIT_C(0, 0x4c9ee35db1d1, 0xf3c946fdcd48fd88, 0x3fbf),
    /* a21
     * base-10: 5.81527769640186708776361513365257702e-20
     * base-16: 1.129e64bff606a2b9c9fc624481cd@-16
     * base-2 : 1.0001001010011110011001001011111111110110000001101010001010111001110010011111110001100010010001001000000111001101e-64 */
    RTFLOAT128U_INIT_C(0, 0x129e64bff606, 0xa2b9c9fc624481cd, 0x3fbf),
};
611
612
613/*
614 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
615 * it all in C is probably safer atm., optimize what's necessary later, maybe.
616 */
617#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
618
619
620/*********************************************************************************************************************************
621* Binary Operations *
622*********************************************************************************************************************************/
623
624/*
625 * ADD
626 */
627
628IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
629{
630 uint64_t uDst = *puDst;
631 uint64_t uResult = uDst + uSrc;
632 *puDst = uResult;
633 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
634}
635
636# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
637
638IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
639{
640 uint32_t uDst = *puDst;
641 uint32_t uResult = uDst + uSrc;
642 *puDst = uResult;
643 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
644}
645
646
647IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
648{
649 uint16_t uDst = *puDst;
650 uint16_t uResult = uDst + uSrc;
651 *puDst = uResult;
652 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
653}
654
655
656IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
657{
658 uint8_t uDst = *puDst;
659 uint8_t uResult = uDst + uSrc;
660 *puDst = uResult;
661 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
662}
663
664# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
665
666/*
667 * ADC
668 */
669
670IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
671{
672 if (!(*pfEFlags & X86_EFL_CF))
673 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
674 else
675 {
676 uint64_t uDst = *puDst;
677 uint64_t uResult = uDst + uSrc + 1;
678 *puDst = uResult;
679 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
680 }
681}
682
683# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
684
685IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
686{
687 if (!(*pfEFlags & X86_EFL_CF))
688 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
689 else
690 {
691 uint32_t uDst = *puDst;
692 uint32_t uResult = uDst + uSrc + 1;
693 *puDst = uResult;
694 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
695 }
696}
697
698
699IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
700{
701 if (!(*pfEFlags & X86_EFL_CF))
702 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
703 else
704 {
705 uint16_t uDst = *puDst;
706 uint16_t uResult = uDst + uSrc + 1;
707 *puDst = uResult;
708 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
709 }
710}
711
712
713IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
714{
715 if (!(*pfEFlags & X86_EFL_CF))
716 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
717 else
718 {
719 uint8_t uDst = *puDst;
720 uint8_t uResult = uDst + uSrc + 1;
721 *puDst = uResult;
722 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
723 }
724}
725
726# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
727
728/*
729 * SUB
730 */
731
732IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
733{
734 uint64_t uDst = *puDst;
735 uint64_t uResult = uDst - uSrc;
736 *puDst = uResult;
737 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
738}
739
740# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
741
742IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
743{
744 uint32_t uDst = *puDst;
745 uint32_t uResult = uDst - uSrc;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
748}
749
750
751IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
752{
753 uint16_t uDst = *puDst;
754 uint16_t uResult = uDst - uSrc;
755 *puDst = uResult;
756 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
757}
758
759
760IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
761{
762 uint8_t uDst = *puDst;
763 uint8_t uResult = uDst - uSrc;
764 *puDst = uResult;
765 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
766}
767
768# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
769
770/*
771 * SBB
772 */
773
774IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
775{
776 if (!(*pfEFlags & X86_EFL_CF))
777 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
778 else
779 {
780 uint64_t uDst = *puDst;
781 uint64_t uResult = uDst - uSrc - 1;
782 *puDst = uResult;
783 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
784 }
785}
786
787# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
788
789IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
790{
791 if (!(*pfEFlags & X86_EFL_CF))
792 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
793 else
794 {
795 uint32_t uDst = *puDst;
796 uint32_t uResult = uDst - uSrc - 1;
797 *puDst = uResult;
798 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
799 }
800}
801
802
803IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
804{
805 if (!(*pfEFlags & X86_EFL_CF))
806 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
807 else
808 {
809 uint16_t uDst = *puDst;
810 uint16_t uResult = uDst - uSrc - 1;
811 *puDst = uResult;
812 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
813 }
814}
815
816
817IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
818{
819 if (!(*pfEFlags & X86_EFL_CF))
820 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
821 else
822 {
823 uint8_t uDst = *puDst;
824 uint8_t uResult = uDst - uSrc - 1;
825 *puDst = uResult;
826 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
827 }
828}
829
830# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
831
832
833/*
834 * OR
835 */
836
837IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
838{
839 uint64_t uResult = *puDst | uSrc;
840 *puDst = uResult;
841 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
842}
843
844# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
845
846IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
847{
848 uint32_t uResult = *puDst | uSrc;
849 *puDst = uResult;
850 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
851}
852
853
854IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
855{
856 uint16_t uResult = *puDst | uSrc;
857 *puDst = uResult;
858 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
859}
860
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
863{
864 uint8_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
867}
868
869# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
870
871/*
872 * XOR
873 */
874
875IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
876{
877 uint64_t uResult = *puDst ^ uSrc;
878 *puDst = uResult;
879 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
880}
881
882# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
883
884IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
885{
886 uint32_t uResult = *puDst ^ uSrc;
887 *puDst = uResult;
888 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
889}
890
891
892IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
893{
894 uint16_t uResult = *puDst ^ uSrc;
895 *puDst = uResult;
896 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
897}
898
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
901{
902 uint8_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
905}
906
907# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
908
909/*
910 * AND
911 */
912
913IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
914{
915 uint64_t const uResult = *puDst & uSrc;
916 *puDst = uResult;
917 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
918}
919
920# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
921
922IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
923{
924 uint32_t const uResult = *puDst & uSrc;
925 *puDst = uResult;
926 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
927}
928
929
930IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
931{
932 uint16_t const uResult = *puDst & uSrc;
933 *puDst = uResult;
934 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
935}
936
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
939{
940 uint8_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
943}
944
945# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
946#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
947
948/*
949 * ANDN (BMI1 instruction)
950 */
951
952IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
953{
954 uint64_t const uResult = ~uSrc1 & uSrc2;
955 *puDst = uResult;
956 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
957}
958
959
960IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
961{
962 uint32_t const uResult = ~uSrc1 & uSrc2;
963 *puDst = uResult;
964 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
965}
966
967
#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
/** ANDN, 64-bit primary entry: no assembly variant here, forward to the C fallback. */
IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
{
    iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
}
#endif


#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
/** ANDN, 32-bit primary entry: no assembly variant here, forward to the C fallback. */
IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
{
    iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
}
#endif
982
983#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
984
985/*
986 * CMP
987 */
988
989IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
990{
991 uint64_t uDstTmp = *puDst;
992 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
993}
994
995# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
996
997IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
998{
999 uint32_t uDstTmp = *puDst;
1000 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1001}
1002
1003
1004IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1005{
1006 uint16_t uDstTmp = *puDst;
1007 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1008}
1009
1010
1011IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1012{
1013 uint8_t uDstTmp = *puDst;
1014 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1015}
1016
1017# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1018
1019/*
1020 * TEST
1021 */
1022
1023IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1024{
1025 uint64_t uResult = *puDst & uSrc;
1026 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1027}
1028
1029# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1030
1031IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1032{
1033 uint32_t uResult = *puDst & uSrc;
1034 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1035}
1036
1037
1038IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1039{
1040 uint16_t uResult = *puDst & uSrc;
1041 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1042}
1043
1044
1045IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1046{
1047 uint8_t uResult = *puDst & uSrc;
1048 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1049}
1050
1051# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1052
1053
1054/*
1055 * LOCK prefixed variants of the above
1056 */
1057
/** Width-generic locked (LOCK prefixed) binary operand operation.
 *  Runs the plain C worker on a local copy inside a compare-exchange retry
 *  loop, so the read-modify-write of *puDst is atomic.  The EFLAGS input is
 *  re-read from *pfEFlags on every retry and the flags of the successful
 *  attempt are written back.  Expects puDst, uSrc and pfEFlags in scope. */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uTmp; \
        uint32_t fEflTmp; \
        do \
        { \
            uTmp = uOld; \
            fEflTmp = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, uSrc, &fEflTmp); \
        } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
        *pfEFlags = fEflTmp; \
    } while (0)
1072
1073
/** Emits the iemAImpl_&lt;mnemonic&gt;_u&lt;width&gt;_locked worker for a binary
 *  operation, implemented in terms of DO_LOCKED_BIN_OP. */
#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
                                                                                      uint ## a_cBitsWidth ## _t uSrc, \
                                                                                      uint32_t *pfEFlags)) \
    { \
        DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
    }
1081
/* Instantiate the locked arithmetic/logical workers.  The 64-bit ones are
   always needed; the narrower widths only when the assembly implementations
   are absent or deliberately disabled. */
EMIT_LOCKED_BIN_OP(add, 64)
EMIT_LOCKED_BIN_OP(adc, 64)
EMIT_LOCKED_BIN_OP(sub, 64)
EMIT_LOCKED_BIN_OP(sbb, 64)
EMIT_LOCKED_BIN_OP(or, 64)
EMIT_LOCKED_BIN_OP(xor, 64)
EMIT_LOCKED_BIN_OP(and, 64)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_LOCKED_BIN_OP(add, 32)
EMIT_LOCKED_BIN_OP(adc, 32)
EMIT_LOCKED_BIN_OP(sub, 32)
EMIT_LOCKED_BIN_OP(sbb, 32)
EMIT_LOCKED_BIN_OP(or, 32)
EMIT_LOCKED_BIN_OP(xor, 32)
EMIT_LOCKED_BIN_OP(and, 32)

EMIT_LOCKED_BIN_OP(add, 16)
EMIT_LOCKED_BIN_OP(adc, 16)
EMIT_LOCKED_BIN_OP(sub, 16)
EMIT_LOCKED_BIN_OP(sbb, 16)
EMIT_LOCKED_BIN_OP(or, 16)
EMIT_LOCKED_BIN_OP(xor, 16)
EMIT_LOCKED_BIN_OP(and, 16)

EMIT_LOCKED_BIN_OP(add, 8)
EMIT_LOCKED_BIN_OP(adc, 8)
EMIT_LOCKED_BIN_OP(sub, 8)
EMIT_LOCKED_BIN_OP(sbb, 8)
EMIT_LOCKED_BIN_OP(or, 8)
EMIT_LOCKED_BIN_OP(xor, 8)
EMIT_LOCKED_BIN_OP(and, 8)
# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1114
1115
1116/*
1117 * Bit operations (same signature as above).
1118 */
1119
1120/*
1121 * BT
1122 */
1123
1124IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1125{
1126 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1127 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1128 Assert(uSrc < 64);
1129 uint64_t uDst = *puDst;
1130 if (uDst & RT_BIT_64(uSrc))
1131 *pfEFlags |= X86_EFL_CF;
1132 else
1133 *pfEFlags &= ~X86_EFL_CF;
1134}
1135
1136# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1137
1138IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1139{
1140 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1141 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1142 Assert(uSrc < 32);
1143 uint32_t uDst = *puDst;
1144 if (uDst & RT_BIT_32(uSrc))
1145 *pfEFlags |= X86_EFL_CF;
1146 else
1147 *pfEFlags &= ~X86_EFL_CF;
1148}
1149
1150IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1151{
1152 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1153 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1154 Assert(uSrc < 16);
1155 uint16_t uDst = *puDst;
1156 if (uDst & RT_BIT_32(uSrc))
1157 *pfEFlags |= X86_EFL_CF;
1158 else
1159 *pfEFlags &= ~X86_EFL_CF;
1160}
1161
1162# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1163
1164/*
1165 * BTC
1166 */
1167
1168IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1169{
1170 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1171 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1172 Assert(uSrc < 64);
1173 uint64_t fMask = RT_BIT_64(uSrc);
1174 uint64_t uDst = *puDst;
1175 if (uDst & fMask)
1176 {
1177 uDst &= ~fMask;
1178 *puDst = uDst;
1179 *pfEFlags |= X86_EFL_CF;
1180 }
1181 else
1182 {
1183 uDst |= fMask;
1184 *puDst = uDst;
1185 *pfEFlags &= ~X86_EFL_CF;
1186 }
1187}
1188
1189# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1190
1191IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1192{
1193 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1194 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1195 Assert(uSrc < 32);
1196 uint32_t fMask = RT_BIT_32(uSrc);
1197 uint32_t uDst = *puDst;
1198 if (uDst & fMask)
1199 {
1200 uDst &= ~fMask;
1201 *puDst = uDst;
1202 *pfEFlags |= X86_EFL_CF;
1203 }
1204 else
1205 {
1206 uDst |= fMask;
1207 *puDst = uDst;
1208 *pfEFlags &= ~X86_EFL_CF;
1209 }
1210}
1211
1212
1213IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1214{
1215 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1216 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1217 Assert(uSrc < 16);
1218 uint16_t fMask = RT_BIT_32(uSrc);
1219 uint16_t uDst = *puDst;
1220 if (uDst & fMask)
1221 {
1222 uDst &= ~fMask;
1223 *puDst = uDst;
1224 *pfEFlags |= X86_EFL_CF;
1225 }
1226 else
1227 {
1228 uDst |= fMask;
1229 *puDst = uDst;
1230 *pfEFlags &= ~X86_EFL_CF;
1231 }
1232}
1233
1234# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1235
1236/*
1237 * BTR
1238 */
1239
1240IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1241{
1242 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1243 logical operation (AND/OR/whatever). */
1244 Assert(uSrc < 64);
1245 uint64_t fMask = RT_BIT_64(uSrc);
1246 uint64_t uDst = *puDst;
1247 if (uDst & fMask)
1248 {
1249 uDst &= ~fMask;
1250 *puDst = uDst;
1251 *pfEFlags |= X86_EFL_CF;
1252 }
1253 else
1254 *pfEFlags &= ~X86_EFL_CF;
1255}
1256
1257# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1258
1259IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1260{
1261 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1262 logical operation (AND/OR/whatever). */
1263 Assert(uSrc < 32);
1264 uint32_t fMask = RT_BIT_32(uSrc);
1265 uint32_t uDst = *puDst;
1266 if (uDst & fMask)
1267 {
1268 uDst &= ~fMask;
1269 *puDst = uDst;
1270 *pfEFlags |= X86_EFL_CF;
1271 }
1272 else
1273 *pfEFlags &= ~X86_EFL_CF;
1274}
1275
1276
1277IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1278{
1279 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1280 logical operation (AND/OR/whatever). */
1281 Assert(uSrc < 16);
1282 uint16_t fMask = RT_BIT_32(uSrc);
1283 uint16_t uDst = *puDst;
1284 if (uDst & fMask)
1285 {
1286 uDst &= ~fMask;
1287 *puDst = uDst;
1288 *pfEFlags |= X86_EFL_CF;
1289 }
1290 else
1291 *pfEFlags &= ~X86_EFL_CF;
1292}
1293
1294# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1295
1296/*
1297 * BTS
1298 */
1299
1300IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1301{
1302 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1303 logical operation (AND/OR/whatever). */
1304 Assert(uSrc < 64);
1305 uint64_t fMask = RT_BIT_64(uSrc);
1306 uint64_t uDst = *puDst;
1307 if (uDst & fMask)
1308 *pfEFlags |= X86_EFL_CF;
1309 else
1310 {
1311 uDst |= fMask;
1312 *puDst = uDst;
1313 *pfEFlags &= ~X86_EFL_CF;
1314 }
1315}
1316
1317# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1318
1319IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1320{
1321 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1322 logical operation (AND/OR/whatever). */
1323 Assert(uSrc < 32);
1324 uint32_t fMask = RT_BIT_32(uSrc);
1325 uint32_t uDst = *puDst;
1326 if (uDst & fMask)
1327 *pfEFlags |= X86_EFL_CF;
1328 else
1329 {
1330 uDst |= fMask;
1331 *puDst = uDst;
1332 *pfEFlags &= ~X86_EFL_CF;
1333 }
1334}
1335
1336
1337IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1338{
1339 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1340 logical operation (AND/OR/whatever). */
1341 Assert(uSrc < 16);
1342 uint16_t fMask = RT_BIT_32(uSrc);
1343 uint32_t uDst = *puDst;
1344 if (uDst & fMask)
1345 *pfEFlags |= X86_EFL_CF;
1346 else
1347 {
1348 uDst |= fMask;
1349 *puDst = uDst;
1350 *pfEFlags &= ~X86_EFL_CF;
1351 }
1352}
1353
1354# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1355
1356
/* Instantiate the locked bit-test-and-modify workers (BTC/BTR/BTS). */
EMIT_LOCKED_BIN_OP(btc, 64)
EMIT_LOCKED_BIN_OP(btr, 64)
EMIT_LOCKED_BIN_OP(bts, 64)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_LOCKED_BIN_OP(btc, 32)
EMIT_LOCKED_BIN_OP(btr, 32)
EMIT_LOCKED_BIN_OP(bts, 32)

EMIT_LOCKED_BIN_OP(btc, 16)
EMIT_LOCKED_BIN_OP(btr, 16)
EMIT_LOCKED_BIN_OP(bts, 16)
# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1369
1370
1371/*
1372 * Helpers for BSR and BSF.
1373 *
1374 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1375 * Intel behavior modelled on 10980xe, AMD on 3990X. Other marchs may
1376 * produce different result (see https://www.sandpile.org/x86/flags.htm),
1377 * but we restrict ourselves to emulating these recent marchs.
1378 */
/** Stores a bit-search result (Intel 10980xe flavor): a_iBit is 1-based
 *  (0 = no bit found).  On a hit, writes the 0-based index to *a_puDst and
 *  derives PF from it; on a miss sets ZF+PF and leaves the destination alone.
 *  Note: the second parameter was previously misspelled 'pfEFlag' and unused,
 *  silently capturing the caller's 'pfEFlags' variable instead. */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/** Stores a bit-search result (AMD 3990X flavor): a_iBit is 1-based
 *  (0 = no bit found).  Only ZF is updated; the destination is written only
 *  on a hit.  Note: the second parameter was previously misspelled 'pfEFlag'
 *  and unused, silently capturing the caller's 'pfEFlags' variable. */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst) = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1401
1402
1403/*
1404 * BSF - first (least significant) bit set
1405 */
/** BSF, 64-bit, generic entry point: uses the Intel flavor of the "undefined" flags. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
}

/** BSF, 64-bit, Intel (10980xe) EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
}

/** BSF, 64-bit, AMD (3990X) EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

/** BSF, 32-bit, generic entry point (Intel flavor). */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
}

/** BSF, 32-bit, Intel EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
}

/** BSF, 32-bit, AMD EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
}


/** BSF, 16-bit, generic entry point (Intel flavor). */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
}

/** BSF, 16-bit, Intel EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
}

/** BSF, 16-bit, AMD EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1455
1456
1457/*
1458 * BSR - last (most significant) bit set
1459 */
/** BSR, 64-bit, generic entry point: uses the Intel flavor of the "undefined" flags. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
}

/** BSR, 64-bit, Intel (10980xe) EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
}

/** BSR, 64-bit, AMD (3990X) EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

/** BSR, 32-bit, generic entry point (Intel flavor). */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
}

/** BSR, 32-bit, Intel EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
}

/** BSR, 32-bit, AMD EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
}


/** BSR, 16-bit, generic entry point (Intel flavor). */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
}

/** BSR, 16-bit, Intel EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
}

/** BSR, 16-bit, AMD EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1509
1510
1511/*
1512 * Helpers for LZCNT and TZCNT.
1513 */
/** Stores a LZCNT/TZCNT result (Intel flavor): writes a_uResult to *a_puDst,
 *  clears the status flags, derives ZF/PF from the result and sets CF when
 *  the source operand was zero.
 *  Fix: a_uSrc is now parenthesized in the expansion (macro hygiene). */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/** Stores a LZCNT/TZCNT result (AMD flavor): writes a_uResult to *a_puDst,
 *  sets ZF when the result is zero and CF when the source operand was zero.
 *  Fix: a_uSrc is now parenthesized in the expansion (macro hygiene). */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1536
1537
1538/*
1539 * LZCNT - count leading zero bits.
1540 */
/** LZCNT, 64-bit, generic entry point: defaults to the Intel flags flavor. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
}

/** LZCNT, 64-bit, Intel EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
}

/** LZCNT, 64-bit, AMD EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

/** LZCNT, 32-bit, generic entry point: defaults to the Intel flags flavor. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
}

/** LZCNT, 32-bit, Intel EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
}

/** LZCNT, 32-bit, AMD EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
}


/** LZCNT, 16-bit, generic entry point: defaults to the Intel flags flavor. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
}

/** LZCNT, 16-bit, Intel EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
}

/** LZCNT, 16-bit, AMD EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1590
1591
1592/*
 * TZCNT - count trailing zero bits.
1594 */
/** TZCNT, 64-bit, generic entry point: defaults to the Intel flags flavor. */
IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
}

/** TZCNT, 64-bit, Intel EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
}

/** TZCNT, 64-bit, AMD EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
}

# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)

/** TZCNT, 32-bit, generic entry point: defaults to the Intel flags flavor. */
IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
}

/** TZCNT, 32-bit, Intel EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
}

/** TZCNT, 32-bit, AMD EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
}


/** TZCNT, 16-bit, generic entry point: defaults to the Intel flags flavor. */
IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
}

/** TZCNT, 16-bit, Intel EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
}

/** TZCNT, 16-bit, AMD EFLAGS behaviour. */
IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
{
    SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
}

# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1644#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1645
1646/*
1647 * BEXTR (BMI1 instruction)
1648 */
/** Emits a BEXTR worker: extracts from uSrc1 the bit field starting at the
 *  bit index given by byte 0 of uSrc2, with the length given by byte 1 of
 *  uSrc2, zero-extended into *puDst.  ZF is set for a zero result; the other
 *  status flags are simply cleared here (see todo below). */
#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
                                                                       a_Type uSrc2, uint32_t *pfEFlags)) \
{ \
    /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
    uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
    a_Type uResult; \
    uint8_t const iFirstBit = (uint8_t)uSrc2; \
    if (iFirstBit < a_cBits) \
    { \
        uResult = uSrc1 >> iFirstBit; \
        uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
        if (cBits < a_cBits) \
            uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
        *puDst = uResult; \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
    } \
    else \
    { \
        *puDst = uResult = 0; \
        fEfl |= X86_EFL_ZF; \
    } \
    /** @todo complete flag calculations. */ \
    *pfEFlags = fEfl; \
}
1675
/* Instantiate the C fallbacks, plus the primary-named workers where no assembly exists. */
EMIT_BEXTR(64, uint64_t, _fallback)
EMIT_BEXTR(32, uint32_t, _fallback)
#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_BEXTR(64, uint64_t, RT_NOTHING)
#endif
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_BEXTR(32, uint32_t, RT_NOTHING)
#endif
1684
1685/*
1686 * BLSR (BMI1 instruction)
1687 */
/** Emits a BLSR worker: *puDst = uSrc & (uSrc - 1), i.e. clears the lowest
 *  set bit of uSrc.  Flags come from the AND, except CF which is taken from
 *  the SUB (so CF is set exactly when uSrc was zero); PF is cleared. */
#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
{ \
    uint32_t fEfl1 = *pfEFlags; \
    uint32_t fEfl2 = fEfl1; \
    *puDst = uSrc; \
    iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
    iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
    \
    /* AMD: The carry flag is from the SUB operation. */ \
    /* 10980xe: PF always cleared? */ \
    fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
    fEfl2 |= fEfl1 & X86_EFL_CF; \
    *pfEFlags = fEfl2; \
}
1703
/* Instantiate the C fallbacks, plus the primary-named workers where no assembly exists. */
EMIT_BLSR(64, uint64_t, _fallback)
EMIT_BLSR(32, uint32_t, _fallback)
#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_BLSR(64, uint64_t, RT_NOTHING)
#endif
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_BLSR(32, uint32_t, RT_NOTHING)
#endif
1712
1713/*
1714 * BLSMSK (BMI1 instruction)
1715 */
/** Emits a BLSMSK worker: *puDst = uSrc ^ (uSrc - 1), i.e. a mask covering
 *  bits up to and including the lowest set bit of uSrc.  Flags come from the
 *  XOR, except CF which is taken from the SUB; PF is cleared. */
#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
{ \
    uint32_t fEfl1 = *pfEFlags; \
    uint32_t fEfl2 = fEfl1; \
    *puDst = uSrc; \
    iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
    iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
    \
    /* AMD: The carry flag is from the SUB operation. */ \
    /* 10980xe: PF always cleared? */ \
    fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
    fEfl2 |= fEfl1 & X86_EFL_CF; \
    *pfEFlags = fEfl2; \
}
1731
/* Instantiate the C fallbacks, plus the primary-named workers where no assembly exists. */
EMIT_BLSMSK(64, uint64_t, _fallback)
EMIT_BLSMSK(32, uint32_t, _fallback)
#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
#endif
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
#endif
1740
1741/*
1742 * BLSI (BMI1 instruction)
1743 */
/** Emits a BLSI worker: *puDst = uSrc & -uSrc, i.e. isolates the lowest set
 *  bit of uSrc.  Flags come from the AND, except CF which is taken from the
 *  NEG; PF is cleared. */
#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
{ \
    uint32_t fEfl1 = *pfEFlags; \
    uint32_t fEfl2 = fEfl1; \
    *puDst = uSrc; \
    iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
    iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
    \
    /* AMD: The carry flag is from the NEG operation (comment previously said SUB). */ \
    /* 10980xe: PF always cleared? */ \
    fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
    fEfl2 |= fEfl1 & X86_EFL_CF; \
    *pfEFlags = fEfl2; \
}
1759
/* Instantiate the C fallbacks, plus the primary-named workers where no assembly exists. */
EMIT_BLSI(64, uint64_t, _fallback)
EMIT_BLSI(32, uint32_t, _fallback)
#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_BLSI(64, uint64_t, RT_NOTHING)
#endif
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_BLSI(32, uint32_t, RT_NOTHING)
#endif
1768
1769/*
1770 * BZHI (BMI2 instruction)
1771 */
/** Emits a BZHI worker: copies uSrc1 with all bits at position >= index
 *  (low byte of uSrc2) cleared.  CF is set when the index is out of range
 *  (>= operand width, in which case the source is copied unmodified);
 *  ZF and SF are computed from the result, OF/AF/PF are cleared. */
#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
                                                                      a_Type uSrc2, uint32_t *pfEFlags)) \
{ \
    uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
    a_Type uResult; \
    uint8_t const iFirstBit = (uint8_t)uSrc2; \
    if (iFirstBit < a_cBits) \
        uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
    else \
    { \
        uResult = uSrc1; \
        fEfl |= X86_EFL_CF; \
    } \
    *puDst = uResult; \
    fEfl |= X86_EFL_CALC_ZF(uResult); \
    fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
    *pfEFlags = fEfl; \
}
1791
1792EMIT_BZHI(64, uint64_t, _fallback)
1793EMIT_BZHI(32, uint32_t, _fallback)
1794#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1795EMIT_BZHI(64, uint64_t, RT_NOTHING)
1796#endif
1797#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1798EMIT_BZHI(32, uint32_t, RT_NOTHING)
1799#endif
1800
1801/*
1802 * POPCNT
1803 */
1804RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1805{
1806 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1807 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1808 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1809 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1810};
1811
1812/** @todo Use native popcount where possible and employ some more efficient
1813 * algorithm here (or in asm.h fallback)! */
1814
1815DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1816{
1817 return g_abBitCounts6[ u16 & 0x3f]
1818 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1819 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1820}
1821
1822DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1823{
1824 return g_abBitCounts6[ u32 & 0x3f]
1825 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1826 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1827 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1828 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1829 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1830}
1831
1832DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1833{
1834 return g_abBitCounts6[ u64 & 0x3f]
1835 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1836 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1837 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1838 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1839 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1840 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1841 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1842 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1843 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1844 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1845}
1846
/**
 * POPCNT - dst = number of set bits in src.
 *
 * ZF is set when the source is zero; all the other status flags (CF, OF, SF,
 * AF, PF) are cleared.
 */
#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
{ \
    uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
    a_Type uResult; \
    if (uSrc) \
        uResult = iemPopCountU ## a_cBits(uSrc); \
    else \
    { \
        fEfl |= X86_EFL_ZF; \
        uResult = 0; \
    } \
    *puDst = uResult; \
    *pfEFlags = fEfl; \
}

/* C fallbacks always; primary entry points only when no assembly variant
   exists for the host architecture. */
EMIT_POPCNT(64, uint64_t, _fallback)
EMIT_POPCNT(32, uint32_t, _fallback)
EMIT_POPCNT(16, uint16_t, _fallback)
#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_POPCNT(64, uint64_t, RT_NOTHING)
#endif
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_POPCNT(32, uint32_t, RT_NOTHING)
EMIT_POPCNT(16, uint16_t, RT_NOTHING)
#endif
1873
1874
1875#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1876
1877/*
1878 * XCHG
1879 */
1880
/**
 * XCHG with a 64-bit memory operand, implicit LOCK semantics.
 *
 * Atomically swaps *puMem and *puReg.  On hosts without a native 64-bit
 * atomic exchange (ARCH_BITS < 64) it is emulated with a compare-exchange
 * retry loop.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
{
#if ARCH_BITS >= 64
    *puReg = ASMAtomicXchgU64(puMem, *puReg);
#else
    /* Retry until the value we last read is still the one in memory. */
    uint64_t uOldMem = *puMem;
    while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
        ASMNopPause();
    *puReg = uOldMem;
#endif
}
1892
1893# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1894
1895IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1896{
1897 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1898}
1899
1900
1901IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1902{
1903 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1904}
1905
1906
1907IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1908{
1909 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1910}
1911
1912# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1913
1914
1915/* Unlocked variants for fDisregardLock mode: */
1916
1917IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1918{
1919 uint64_t const uOld = *puMem;
1920 *puMem = *puReg;
1921 *puReg = uOld;
1922}
1923
1924# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1927{
1928 uint32_t const uOld = *puMem;
1929 *puMem = *puReg;
1930 *puReg = uOld;
1931}
1932
1933
1934IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1935{
1936 uint16_t const uOld = *puMem;
1937 *puMem = *puReg;
1938 *puReg = uOld;
1939}
1940
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1943{
1944 uint8_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1950
1951
1952/*
1953 * XADD and LOCK XADD.
1954 */
1955#define EMIT_XADD(a_cBitsWidth, a_Type) \
1956IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1957{ \
1958 a_Type uDst = *puDst; \
1959 a_Type uResult = uDst; \
1960 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1961 *puDst = uResult; \
1962 *puReg = uDst; \
1963} \
1964\
1965IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1966{ \
1967 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1968 a_Type uResult; \
1969 uint32_t fEflTmp; \
1970 do \
1971 { \
1972 uResult = uOld; \
1973 fEflTmp = *pfEFlags; \
1974 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
1975 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
1976 *puReg = uOld; \
1977 *pfEFlags = fEflTmp; \
1978}
1979EMIT_XADD(64, uint64_t)
1980# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1981EMIT_XADD(32, uint32_t)
1982EMIT_XADD(16, uint16_t)
1983EMIT_XADD(8, uint8_t)
1984# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1985
1986#endif
1987
1988/*
1989 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
1990 *
1991 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
1992 * instructions are emulated as locked.
1993 */
1994#if defined(IEM_WITHOUT_ASSEMBLY)
1995
/**
 * CMPXCHG r/m8, r8, locked.
 *
 * Atomically compares *pu8Dst with AL (*puAl): on match *pu8Dst is replaced
 * by uSrcReg, otherwise *puAl receives the current memory value.  The EFLAGS
 * are then produced by comparing the original AL with the updated *puAl, so
 * ZF ends up set exactly when the exchange succeeded.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
{
    uint8_t uOld = *puAl;
    if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
        Assert(*puAl == uOld);
    iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
}
2003
2004
/**
 * CMPXCHG r/m16, r16, locked.
 *
 * Atomically compares *pu16Dst with AX (*puAx): on match *pu16Dst becomes
 * uSrcReg, otherwise *puAx receives the current memory value.  The EFLAGS
 * are produced by comparing the original AX with the updated *puAx, so ZF
 * ends up set exactly when the exchange succeeded.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
{
    uint16_t uOld = *puAx;
    if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
        Assert(*puAx == uOld);
    iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
}
2012
2013
/**
 * CMPXCHG r/m32, r32, locked.
 *
 * Atomically compares *pu32Dst with EAX (*puEax): on match *pu32Dst becomes
 * uSrcReg, otherwise *puEax receives the current memory value.  The EFLAGS
 * are produced by comparing the original EAX with the updated *puEax, so ZF
 * ends up set exactly when the exchange succeeded.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
{
    uint32_t uOld = *puEax;
    if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
        Assert(*puEax == uOld);
    iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
}
2021
2022
/**
 * CMPXCHG r/m64, r64, locked.
 *
 * Atomically compares *pu64Dst with RAX (*puRax): on match *pu64Dst becomes
 * the source register value, otherwise *puRax receives the current memory
 * value; the EFLAGS are produced by comparing the original RAX with the
 * updated *puRax (ZF set on success).  On 32-bit hosts the source register
 * is passed by reference instead of by value.
 */
# if ARCH_BITS == 32
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
# else
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
# endif
{
# if ARCH_BITS == 32
    uint64_t const uSrcReg = *puSrcReg;
# endif
    uint64_t uOld = *puRax;
    if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
        Assert(*puRax == uOld);
    iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2040 uint32_t *pEFlags))
2041{
2042 uint64_t const uNew = pu64EbxEcx->u;
2043 uint64_t const uOld = pu64EaxEdx->u;
2044 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2045 {
2046 Assert(pu64EaxEdx->u == uOld);
2047 *pEFlags |= X86_EFL_ZF;
2048 }
2049 else
2050 *pEFlags &= ~X86_EFL_ZF;
2051}
2052
2053
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * CMPXCHG16B, locked.
 *
 * Atomically compares the 128-bit memory operand with RDX:RAX; on match it is
 * replaced by RCX:RBX and ZF is set, otherwise the current memory value is
 * loaded into RDX:RAX and ZF is cleared.  AMD64 hosts use the two-part
 * cmpxchg16b wrapper, other hosts the plain 128-bit compare-exchange.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
# ifdef VBOX_STRICT
    RTUINT128U const uOld = *pu128RaxRdx;
# endif
# if defined(RT_ARCH_AMD64)
    if (ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
                               &pu128RaxRdx->u))
# else
    if (ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u))
# endif
    {
        Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
    else
        *pEFlags &= ~X86_EFL_ZF;
}
# endif
2075
2076#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2077
2078# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2079IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2080 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2081{
2082 RTUINT128U u128Tmp = *pu128Dst;
2083 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2084 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2085 {
2086 *pu128Dst = *pu128RbxRcx;
2087 *pEFlags |= X86_EFL_ZF;
2088 }
2089 else
2090 {
2091 *pu128RaxRdx = u128Tmp;
2092 *pEFlags &= ~X86_EFL_ZF;
2093 }
2094}
2095#endif /* !RT_ARCH_ARM64 */
2096
2097#if defined(IEM_WITHOUT_ASSEMBLY)
2098
/* Unlocked versions mapped to the locked ones (as noted above for this
   section, no non-atomic compare-exchange primitives are available, so the
   plain CMPXCHG instructions are emulated as locked): */

IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
}


/* 32-bit hosts pass the 64-bit source register by reference. */
# if ARCH_BITS == 32
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
}
# else
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
}
# endif


IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
}


IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                             uint32_t *pEFlags))
{
    iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
}
2143
2144#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2145
2146#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2147 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2148
2149/*
2150 * MUL, IMUL, DIV and IDIV helpers.
2151 *
2152 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2153 * division step so we can select between using C operators and
2154 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2155 *
2156 * - The U8 versions work returns output in AL + AH instead of xDX + xAX, with the
2157 * IDIV/DIV taking all the input in AX too. This means we have to abstract some
2158 * input loads and the result storing.
2159 */
2160
2161DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2162{
2163# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2164 pQuotient->s.Lo = 0;
2165 pQuotient->s.Hi = 0;
2166# endif
2167 RTUINT128U Divisor;
2168 Divisor.s.Lo = u64Divisor;
2169 Divisor.s.Hi = 0;
2170 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2171}
2172
/* Dividend loaders: assemble the double-width dividend from the xDX:xAX
   register pair, or from AX alone for the 8-bit variants. */
# define DIV_LOAD(a_Dividend) \
    a_Dividend.s.Lo = *puA, a_Dividend.s.Hi = *puD
# define DIV_LOAD_U8(a_Dividend) \
    a_Dividend.u = *puAX

/* Division result stores: quotient to xAX/AL, remainder to xDX/AH. */
# define DIV_STORE(a_Quotient, a_uReminder)    *puA = (a_Quotient), *puD = (a_uReminder)
# define DIV_STORE_U8(a_Quotient, a_uReminder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uReminder) << 8)

/* First-factor loaders for MUL/IMUL (xAX, or AL for the 8-bit variants). */
# define MUL_LOAD_F1()                         *puA
# define MUL_LOAD_F1_U8()                      ((uint8_t)*puAX)

/* Product stores: low half to xAX/AL, high half to xDX/AH. */
# define MUL_STORE(a_Result)                   *puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi
# define MUL_STORE_U8(a_Result)                *puAX = a_Result.u

/* Two's complement negation of a double-width value (plain vs 128-bit). */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Widening multiplication (C operator vs 128-bit helper). */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2);

/* Division with remainder (C operators vs 128-bit helper). */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    a_Quotient.u = (a_Dividend).u / (a_uDivisor), \
    a_Remainder.u = (a_Dividend).u % (a_uDivisor)
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&a_Quotient, &a_Remainder, &a_Dividend, a_uDivisor)
2202
2203
2204/*
2205 * MUL
2206 */
2207# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2208IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2209{ \
2210 RTUINT ## a_cBitsWidth2x ## U Result; \
2211 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2212 a_fnStore(Result); \
2213 \
2214 /* Calc EFLAGS: */ \
2215 uint32_t fEfl = *pfEFlags; \
2216 if (a_fIntelFlags) \
2217 { /* Intel: 6700K and 10980XE behavior */ \
2218 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2219 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2220 fEfl |= X86_EFL_SF; \
2221 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2222 if (Result.s.Hi != 0) \
2223 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2224 } \
2225 else \
2226 { /* AMD: 3990X */ \
2227 if (Result.s.Hi != 0) \
2228 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2229 else \
2230 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2231 } \
2232 *pfEFlags = fEfl; \
2233 return 0; \
2234} \
2235
2236# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2237 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2238 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2239 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2240
2241# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2242EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2243 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2244# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2245EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2246 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2247EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2248 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2249EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2250 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2251# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2252# endif /* !DOXYGEN_RUNNING */
2253
2254/*
2255 * MULX
2256 */
2257# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2258IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2259 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2260{ \
2261 RTUINT ## a_cBitsWidth2x ## U Result; \
2262 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2263 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2264 *puDst1 = Result.s.Hi; \
2265} \
2266
2267# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2268EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2269EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2270# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2271EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2272EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2273# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2274# endif /* !DOXYGEN_RUNNING */
2275
2276
2277/*
2278 * IMUL
2279 *
2280 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2281 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2282 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2283 */
2284# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2285 a_Suffix, a_fIntelFlags) \
2286IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2287{ \
2288 RTUINT ## a_cBitsWidth2x ## U Result; \
2289 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2290 \
2291 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2292 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2293 { \
2294 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2295 { \
2296 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2297 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2298 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2299 } \
2300 else \
2301 { \
2302 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2303 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2304 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2305 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2306 a_fnNeg(Result, a_cBitsWidth2x); \
2307 } \
2308 } \
2309 else \
2310 { \
2311 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2312 { \
2313 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2314 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2315 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2316 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2317 a_fnNeg(Result, a_cBitsWidth2x); \
2318 } \
2319 else \
2320 { \
2321 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2322 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2323 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2324 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2325 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2326 } \
2327 } \
2328 a_fnStore(Result); \
2329 \
2330 if (a_fIntelFlags) \
2331 { \
2332 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2333 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2334 fEfl |= X86_EFL_SF; \
2335 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2336 } \
2337 *pfEFlags = fEfl; \
2338 return 0; \
2339}
2340# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2341 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2342 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2343 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2344
2345# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2346EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2347 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2348# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2349EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2350 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2351EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2352 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2353EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2354 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2355# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2356# endif /* !DOXYGEN_RUNNING */
2357
2358
2359/*
2360 * IMUL with two operands are mapped onto the three operand variant, ignoring
2361 * the high part of the product.
2362 */
2363# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2364IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2365{ \
2366 a_uType uIgn; \
2367 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2368} \
2369\
2370IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2371{ \
2372 a_uType uIgn; \
2373 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2374} \
2375\
2376IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2377{ \
2378 a_uType uIgn; \
2379 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2380}
2381
2382EMIT_IMUL_TWO(64, uint64_t)
2383# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2384EMIT_IMUL_TWO(32, uint32_t)
2385EMIT_IMUL_TWO(16, uint16_t)
2386# endif
2387
2388
2389/*
2390 * DIV
2391 */
2392# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2393 a_Suffix, a_fIntelFlags) \
2394IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2395{ \
2396 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2397 a_fnLoad(Dividend); \
2398 if ( uDivisor != 0 \
2399 && Dividend.s.Hi < uDivisor) \
2400 { \
2401 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2402 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2403 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2404 \
2405 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2406 if (!a_fIntelFlags) \
2407 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2408 return 0; \
2409 } \
2410 /* #DE */ \
2411 return -1; \
2412}
2413# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2414 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2415 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2416 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2417
2418# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2419EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2420 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2421# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2422EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2423 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2424EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2425 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2426EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2427 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2428# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2429# endif /* !DOXYGEN_RUNNING */
2430
2431
2432/*
2433 * IDIV
2434 *
2435 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2436 * set AF and clear PF, ZF and SF just like it does for DIV.
2437 *
2438 */
2439# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2440 a_Suffix, a_fIntelFlags) \
2441IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2442{ \
2443 /* Note! Skylake leaves all flags alone. */ \
2444 \
2445 /** @todo overflow checks */ \
2446 if (uDivisor != 0) \
2447 { \
2448 /* \
2449 * Convert to unsigned division. \
2450 */ \
2451 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2452 a_fnLoad(Dividend); \
2453 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2454 if (fSignedDividend) \
2455 a_fnNeg(Dividend, a_cBitsWidth2x); \
2456 \
2457 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2458 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2459 uDivisorPositive = uDivisor; \
2460 else \
2461 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2462 \
2463 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2464 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2465 \
2466 /* \
2467 * Setup the result, checking for overflows. \
2468 */ \
2469 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2470 { \
2471 if (!fSignedDividend) \
2472 { \
2473 /* Positive divisor, positive dividend => result positive. */ \
2474 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2475 { \
2476 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2477 if (!a_fIntelFlags) \
2478 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2479 return 0; \
2480 } \
2481 } \
2482 else \
2483 { \
2484 /* Positive divisor, negative dividend => result negative. */ \
2485 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2486 { \
2487 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2488 if (!a_fIntelFlags) \
2489 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2490 return 0; \
2491 } \
2492 } \
2493 } \
2494 else \
2495 { \
2496 if (!fSignedDividend) \
2497 { \
2498 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2499 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2500 { \
2501 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2502 if (!a_fIntelFlags) \
2503 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2504 return 0; \
2505 } \
2506 } \
2507 else \
2508 { \
2509 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2510 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2511 { \
2512 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2513 if (!a_fIntelFlags) \
2514 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2515 return 0; \
2516 } \
2517 } \
2518 } \
2519 } \
2520 /* #DE */ \
2521 return -1; \
2522}
2523# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2524 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2525 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2526 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2527
2528# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2529EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2530 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2531# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2532EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2533 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2534EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2535 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2536EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2537 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2538# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2539# endif /* !DOXYGEN_RUNNING */
2540
2541#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2542
2543
2544/*********************************************************************************************************************************
2545* Unary operations. *
2546*********************************************************************************************************************************/
2547#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2548
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Updates the status bits (PF, AF, ZF, SF, and OF) in *a_pfEFlags for an INC
 * or DEC instruction.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * AF is derived by XORing the result with the original value: since the
 * other addend/subtrahend is the constant 1, bit 4 differs exactly when
 * there was a carry/borrow out of the low nibble.
 *
 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update in place.
 * @param a_uResult Unsigned result value.
 * @param a_uDst The original destination value (for AF & OF calc).
 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
 * @param a_OfMethod 0 for INC-style OF (non-negative input whose result
 * has the sign bit set), 1 for DEC-style OF (negative
 * input whose result has the sign bit clear).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
                                                                   : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2574
2575/*
2576 * INC
2577 */
2578
2579IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2580{
2581 uint64_t uDst = *puDst;
2582 uint64_t uResult = uDst + 1;
2583 *puDst = uResult;
2584 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2585}
2586
2587# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2588
2589IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2590{
2591 uint32_t uDst = *puDst;
2592 uint32_t uResult = uDst + 1;
2593 *puDst = uResult;
2594 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2595}
2596
2597
2598IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2599{
2600 uint16_t uDst = *puDst;
2601 uint16_t uResult = uDst + 1;
2602 *puDst = uResult;
2603 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2604}
2605
2606IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2607{
2608 uint8_t uDst = *puDst;
2609 uint8_t uResult = uDst + 1;
2610 *puDst = uResult;
2611 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2612}
2613
2614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2615
2616
2617/*
2618 * DEC
2619 */
2620
2621IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2622{
2623 uint64_t uDst = *puDst;
2624 uint64_t uResult = uDst - 1;
2625 *puDst = uResult;
2626 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2627}
2628
2629# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint32_t uDst = *puDst;
2634 uint32_t uResult = uDst - 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2637}
2638
2639
2640IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2641{
2642 uint16_t uDst = *puDst;
2643 uint16_t uResult = uDst - 1;
2644 *puDst = uResult;
2645 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2646}
2647
2648
2649IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2650{
2651 uint8_t uDst = *puDst;
2652 uint8_t uResult = uDst - 1;
2653 *puDst = uResult;
2654 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2655}
2656
2657# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2658
2659
2660/*
2661 * NOT
2662 */
2663
2664IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2665{
2666 uint64_t uDst = *puDst;
2667 uint64_t uResult = ~uDst;
2668 *puDst = uResult;
2669 /* EFLAGS are not modified. */
2670 RT_NOREF_PV(pfEFlags);
2671}
2672
2673# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2674
2675IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2676{
2677 uint32_t uDst = *puDst;
2678 uint32_t uResult = ~uDst;
2679 *puDst = uResult;
2680 /* EFLAGS are not modified. */
2681 RT_NOREF_PV(pfEFlags);
2682}
2683
2684IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2685{
2686 uint16_t uDst = *puDst;
2687 uint16_t uResult = ~uDst;
2688 *puDst = uResult;
2689 /* EFLAGS are not modified. */
2690 RT_NOREF_PV(pfEFlags);
2691}
2692
2693IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2694{
2695 uint8_t uDst = *puDst;
2696 uint8_t uResult = ~uDst;
2697 *puDst = uResult;
2698 /* EFLAGS are not modified. */
2699 RT_NOREF_PV(pfEFlags);
2700}
2701
2702# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2703
2704
2705/*
2706 * NEG
2707 */
2708
2709/**
2710 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for an NEG instruction.
2711 *
2712 * @returns Status bits.
2713 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2714 * @param a_uResult Unsigned result value.
2715 * @param a_uDst The original destination value (for AF calc).
2716 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2717 */
2718#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
2719 do { \
2720 uint32_t fEflTmp = *(a_pfEFlags); \
2721 fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; \
2722 fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
2723 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
2724 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2725 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2726 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2727 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
2728 *(a_pfEFlags) = fEflTmp; \
2729 } while (0)
2730
2731IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2732{
2733 uint64_t uDst = *puDst;
2734 uint64_t uResult = (uint64_t)0 - uDst;
2735 *puDst = uResult;
2736 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2737}
2738
2739# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2740
2741IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2742{
2743 uint32_t uDst = *puDst;
2744 uint32_t uResult = (uint32_t)0 - uDst;
2745 *puDst = uResult;
2746 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2747}
2748
2749
2750IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2751{
2752 uint16_t uDst = *puDst;
2753 uint16_t uResult = (uint16_t)0 - uDst;
2754 *puDst = uResult;
2755 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2756}
2757
2758
2759IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2760{
2761 uint8_t uDst = *puDst;
2762 uint8_t uResult = (uint8_t)0 - uDst;
2763 *puDst = uResult;
2764 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2765}
2766
2767# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2768
2769/*
2770 * Locked variants.
2771 */
2772
/** Emit a function for doing a locked unary operand operation.
 *
 * The generated function wraps the plain C implementation of the operation in
 * an atomic compare-and-exchange retry loop, making the read-modify-write of
 * *puDst atomic.  EFLAGS are computed on a scratch copy each iteration and
 * only published once the exchange succeeds. */
# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
                                                                                      uint32_t *pfEFlags)) \
    { \
        uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uTmp; \
        uint32_t fEflTmp; \
        do \
        { \
            uTmp = uOld; \
            fEflTmp = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
        } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
        *pfEFlags = fEflTmp; \
    }
2789
/* Instantiate the locked variants for all four unary ops and all operand
   sizes.  The 64-bit ones are always generated here; the smaller sizes only
   when the hand-written assembly versions are not in use. */
EMIT_LOCKED_UNARY_OP(inc, 64)
EMIT_LOCKED_UNARY_OP(dec, 64)
EMIT_LOCKED_UNARY_OP(not, 64)
EMIT_LOCKED_UNARY_OP(neg, 64)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_LOCKED_UNARY_OP(inc, 32)
EMIT_LOCKED_UNARY_OP(dec, 32)
EMIT_LOCKED_UNARY_OP(not, 32)
EMIT_LOCKED_UNARY_OP(neg, 32)

EMIT_LOCKED_UNARY_OP(inc, 16)
EMIT_LOCKED_UNARY_OP(dec, 16)
EMIT_LOCKED_UNARY_OP(not, 16)
EMIT_LOCKED_UNARY_OP(neg, 16)

EMIT_LOCKED_UNARY_OP(inc, 8)
EMIT_LOCKED_UNARY_OP(dec, 8)
EMIT_LOCKED_UNARY_OP(not, 8)
EMIT_LOCKED_UNARY_OP(neg, 8)
# endif
2810
2811#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2812
2813
2814/*********************************************************************************************************************************
2815* Shifting and Rotating *
2816*********************************************************************************************************************************/
2817
2818/*
2819 * ROL
2820 */
2821#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2822IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2823{ \
2824 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2825 if (cShift) \
2826 { \
2827 if (a_cBitsWidth < 32) \
2828 cShift &= a_cBitsWidth - 1; \
2829 a_uType const uDst = *puDst; \
2830 a_uType const uResult = a_fnHlp(uDst, cShift); \
2831 *puDst = uResult; \
2832 \
2833 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
2834 it the same way as for 1 bit shifts. */ \
2835 AssertCompile(X86_EFL_CF_BIT == 0); \
2836 uint32_t fEfl = *pfEFlags; \
2837 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2838 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2839 fEfl |= fCarry; \
2840 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2841 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2842 else /* Intel 10980XE: According to the first sub-shift: */ \
2843 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2844 *pfEFlags = fEfl; \
2845 } \
2846}
2847
2848#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2849EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2850#endif
2851EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2852EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2853
2854#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2855EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2856#endif
2857EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2858EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2859
2860DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2861{
2862 return (uValue << cShift) | (uValue >> (16 - cShift));
2863}
2864#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2865EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2866#endif
2867EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2868EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2869
2870DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2871{
2872 return (uValue << cShift) | (uValue >> (8 - cShift));
2873}
2874#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2875EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2876#endif
2877EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2878EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2879
2880
2881/*
2882 * ROR
2883 */
2884#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2885IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2886{ \
2887 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2888 if (cShift) \
2889 { \
2890 if (a_cBitsWidth < 32) \
2891 cShift &= a_cBitsWidth - 1; \
2892 a_uType const uDst = *puDst; \
2893 a_uType const uResult = a_fnHlp(uDst, cShift); \
2894 *puDst = uResult; \
2895 \
2896 /* Calc EFLAGS: */ \
2897 AssertCompile(X86_EFL_CF_BIT == 0); \
2898 uint32_t fEfl = *pfEFlags; \
2899 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2900 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2901 fEfl |= fCarry; \
2902 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2903 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2904 else /* Intel 10980XE: According to the first sub-shift: */ \
2905 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2906 *pfEFlags = fEfl; \
2907 } \
2908}
2909
2910#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2911EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2912#endif
2913EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2914EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2915
2916#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2917EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2918#endif
2919EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2920EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2921
2922DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2923{
2924 return (uValue >> cShift) | (uValue << (16 - cShift));
2925}
2926#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2927EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2928#endif
2929EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2930EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2931
2932DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2933{
2934 return (uValue >> cShift) | (uValue << (8 - cShift));
2935}
2936#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2937EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2938#endif
2939EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2940EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2941
2942
2943/*
2944 * RCL
2945 */
2946#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2947IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2948{ \
2949 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2950 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2951 cShift %= a_cBitsWidth + 1; \
2952 if (cShift) \
2953 { \
2954 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2955 cShift %= a_cBitsWidth + 1; \
2956 a_uType const uDst = *puDst; \
2957 a_uType uResult = uDst << cShift; \
2958 if (cShift > 1) \
2959 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2960 \
2961 AssertCompile(X86_EFL_CF_BIT == 0); \
2962 uint32_t fEfl = *pfEFlags; \
2963 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2964 uResult |= (a_uType)fInCarry << (cShift - 1); \
2965 \
2966 *puDst = uResult; \
2967 \
2968 /* Calc EFLAGS. */ \
2969 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2970 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2971 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2972 fEfl |= fOutCarry; \
2973 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2974 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
2975 else /* Intel 10980XE: According to the first sub-shift: */ \
2976 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2977 *pfEFlags = fEfl; \
2978 } \
2979}
2980
2981#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2982EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
2983#endif
2984EMIT_RCL(64, uint64_t, _intel, 1)
2985EMIT_RCL(64, uint64_t, _amd, 0)
2986
2987#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2988EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
2989#endif
2990EMIT_RCL(32, uint32_t, _intel, 1)
2991EMIT_RCL(32, uint32_t, _amd, 0)
2992
2993#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2994EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
2995#endif
2996EMIT_RCL(16, uint16_t, _intel, 1)
2997EMIT_RCL(16, uint16_t, _amd, 0)
2998
2999#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3000EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3001#endif
3002EMIT_RCL(8, uint8_t, _intel, 1)
3003EMIT_RCL(8, uint8_t, _amd, 0)
3004
3005
3006/*
3007 * RCR
3008 */
3009#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3010IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3011{ \
3012 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3013 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3014 cShift %= a_cBitsWidth + 1; \
3015 if (cShift) \
3016 { \
3017 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3018 cShift %= a_cBitsWidth + 1; \
3019 a_uType const uDst = *puDst; \
3020 a_uType uResult = uDst >> cShift; \
3021 if (cShift > 1) \
3022 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3023 \
3024 AssertCompile(X86_EFL_CF_BIT == 0); \
3025 uint32_t fEfl = *pfEFlags; \
3026 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3027 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3028 *puDst = uResult; \
3029 \
3030 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3031 it the same way as for 1 bit shifts. */ \
3032 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3033 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3034 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3035 fEfl |= fOutCarry; \
3036 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3037 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3038 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3039 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3040 *pfEFlags = fEfl; \
3041 } \
3042}
3043
3044#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3045EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3046#endif
3047EMIT_RCR(64, uint64_t, _intel, 1)
3048EMIT_RCR(64, uint64_t, _amd, 0)
3049
3050#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3051EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3052#endif
3053EMIT_RCR(32, uint32_t, _intel, 1)
3054EMIT_RCR(32, uint32_t, _amd, 0)
3055
3056#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3057EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3058#endif
3059EMIT_RCR(16, uint16_t, _intel, 1)
3060EMIT_RCR(16, uint16_t, _amd, 0)
3061
3062#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3063EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3064#endif
3065EMIT_RCR(8, uint8_t, _intel, 1)
3066EMIT_RCR(8, uint8_t, _amd, 0)
3067
3068
3069/*
3070 * SHL
3071 */
3072#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3073IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3074{ \
3075 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3076 if (cShift) \
3077 { \
3078 a_uType const uDst = *puDst; \
3079 a_uType uResult = uDst << cShift; \
3080 *puDst = uResult; \
3081 \
3082 /* Calc EFLAGS. */ \
3083 AssertCompile(X86_EFL_CF_BIT == 0); \
3084 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3085 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3086 fEfl |= fCarry; \
3087 if (!a_fIntelFlags) \
3088 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3089 else \
3090 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3091 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3092 fEfl |= X86_EFL_CALC_ZF(uResult); \
3093 fEfl |= g_afParity[uResult & 0xff]; \
3094 if (!a_fIntelFlags) \
3095 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3096 *pfEFlags = fEfl; \
3097 } \
3098}
3099
3100#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3101EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3102#endif
3103EMIT_SHL(64, uint64_t, _intel, 1)
3104EMIT_SHL(64, uint64_t, _amd, 0)
3105
3106#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3107EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3108#endif
3109EMIT_SHL(32, uint32_t, _intel, 1)
3110EMIT_SHL(32, uint32_t, _amd, 0)
3111
3112#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3113EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3114#endif
3115EMIT_SHL(16, uint16_t, _intel, 1)
3116EMIT_SHL(16, uint16_t, _amd, 0)
3117
3118#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3119EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3120#endif
3121EMIT_SHL(8, uint8_t, _intel, 1)
3122EMIT_SHL(8, uint8_t, _amd, 0)
3123
3124
3125/*
3126 * SHR
3127 */
3128#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3129IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3130{ \
3131 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3132 if (cShift) \
3133 { \
3134 a_uType const uDst = *puDst; \
3135 a_uType uResult = uDst >> cShift; \
3136 *puDst = uResult; \
3137 \
3138 /* Calc EFLAGS. */ \
3139 AssertCompile(X86_EFL_CF_BIT == 0); \
3140 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3141 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3142 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3143 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3144 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3145 fEfl |= X86_EFL_CALC_ZF(uResult); \
3146 fEfl |= g_afParity[uResult & 0xff]; \
3147 if (!a_fIntelFlags) \
3148 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3149 *pfEFlags = fEfl; \
3150 } \
3151}
3152
3153#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3154EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3155#endif
3156EMIT_SHR(64, uint64_t, _intel, 1)
3157EMIT_SHR(64, uint64_t, _amd, 0)
3158
3159#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3160EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3161#endif
3162EMIT_SHR(32, uint32_t, _intel, 1)
3163EMIT_SHR(32, uint32_t, _amd, 0)
3164
3165#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3166EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3167#endif
3168EMIT_SHR(16, uint16_t, _intel, 1)
3169EMIT_SHR(16, uint16_t, _amd, 0)
3170
3171#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3172EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3173#endif
3174EMIT_SHR(8, uint8_t, _intel, 1)
3175EMIT_SHR(8, uint8_t, _amd, 0)
3176
3177
3178/*
3179 * SAR
3180 */
3181#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3182IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3183{ \
3184 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3185 if (cShift) \
3186 { \
3187 a_iType const iDst = (a_iType)*puDst; \
3188 a_uType uResult = iDst >> cShift; \
3189 *puDst = uResult; \
3190 \
3191 /* Calc EFLAGS. \
3192 Note! The OF flag is always zero because the result never differs from the input. */ \
3193 AssertCompile(X86_EFL_CF_BIT == 0); \
3194 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3195 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3196 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3197 fEfl |= X86_EFL_CALC_ZF(uResult); \
3198 fEfl |= g_afParity[uResult & 0xff]; \
3199 if (!a_fIntelFlags) \
3200 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3201 *pfEFlags = fEfl; \
3202 } \
3203}
3204
3205#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3206EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3207#endif
3208EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3209EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3210
3211#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3212EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3213#endif
3214EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3215EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3216
3217#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3218EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3219#endif
3220EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3221EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3222
3223#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3224EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3225#endif
3226EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3227EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3228
3229
3230/*
3231 * SHLD
3232 *
3233 * - CF is the last bit shifted out of puDst.
3234 * - AF is always cleared by Intel 10980XE.
3235 * - AF is always set by AMD 3990X.
3236 * - OF is set according to the first shift on Intel 10980XE, it seems.
3237 * - OF is set according to the last sub-shift on AMD 3990X.
3238 * - ZF, SF and PF are calculated according to the result by both vendors.
3239 *
3240 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3241 * pick either the source register or the destination register for input bits
3242 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3243 * intel has changed behaviour here several times. We implement what current
3244 * skylake based does for now, we can extend this later as needed.
3245 */
3246#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3247IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3248 uint32_t *pfEFlags)) \
3249{ \
3250 cShift &= a_cBitsWidth - 1; \
3251 if (cShift) \
3252 { \
3253 a_uType const uDst = *puDst; \
3254 a_uType uResult = uDst << cShift; \
3255 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3256 *puDst = uResult; \
3257 \
3258 /* CALC EFLAGS: */ \
3259 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3260 if (a_fIntelFlags) \
3261 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3262 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3263 else \
3264 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3265 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3266 fEfl |= X86_EFL_AF; \
3267 } \
3268 AssertCompile(X86_EFL_CF_BIT == 0); \
3269 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3270 fEfl |= g_afParity[uResult & 0xff]; \
3271 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3272 fEfl |= X86_EFL_CALC_ZF(uResult); \
3273 *pfEFlags = fEfl; \
3274 } \
3275}
3276
3277#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3278EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3279#endif
3280EMIT_SHLD(64, uint64_t, _intel, 1)
3281EMIT_SHLD(64, uint64_t, _amd, 0)
3282
3283#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3284EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3285#endif
3286EMIT_SHLD(32, uint32_t, _intel, 1)
3287EMIT_SHLD(32, uint32_t, _amd, 0)
3288
/* 16-bit SHLD: the count is masked with 31, so the shift can consume more
   than 16 bits.  A 48-bit combined operand (dst:src:dst on Intel,
   dst:src:src on AMD) models which register supplies bits beyond 16. */
#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= 31; \
    if (cShift) \
    { \
        uint16_t const uDst = *puDst; \
        uint64_t const uTmp = a_fIntelFlags \
                            ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
                            : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
        uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
        *puDst = uResult; \
        \
        /* CALC EFLAGS: */ \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        if (a_fIntelFlags) \
        { \
            fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
            /* Intel 6700K & 10980XE: OF is set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
        } \
        else \
        { \
            /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
            if (cShift < 16) \
            { \
                fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
                fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
            } \
            else \
            { \
                if (cShift == 16) \
                    fEfl |= uDst & X86_EFL_CF; \
                fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
            } \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= g_afParity[uResult & 0xff]; \
        fEfl |= X86_EFL_CALC_SF(uResult, 16); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        *pfEFlags = fEfl; \
    } \
}

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLD_16(RT_NOTHING, 1)
#endif
EMIT_SHLD_16(_intel, 1)
EMIT_SHLD_16(_amd, 0)
3339
3340
3341/*
3342 * SHRD
3343 *
3344 * EFLAGS behaviour seems to be the same as with SHLD:
3345 * - CF is the last bit shifted out of puDst.
3346 * - AF is always cleared by Intel 10980XE.
3347 * - AF is always set by AMD 3990X.
3348 * - OF is set according to the first shift on Intel 10980XE, it seems.
3349 * - OF is set according to the last sub-shift on AMD 3990X.
3350 * - ZF, SF and PF are calculated according to the result by both vendors.
3351 *
3352 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3353 * pick either the source register or the destination register for input bits
3354 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3355 * intel has changed behaviour here several times. We implement what current
3356 * skylake based does for now, we can extend this later as needed.
3357 */
3358#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3359IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3360{ \
3361 cShift &= a_cBitsWidth - 1; \
3362 if (cShift) \
3363 { \
3364 a_uType const uDst = *puDst; \
3365 a_uType uResult = uDst >> cShift; \
3366 uResult |= uSrc << (a_cBitsWidth - cShift); \
3367 *puDst = uResult; \
3368 \
3369 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3370 AssertCompile(X86_EFL_CF_BIT == 0); \
3371 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3372 if (a_fIntelFlags) \
3373 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3374 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3375 else \
3376 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3377 if (cShift > 1) /* Set according to last shift. */ \
3378 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3379 else \
3380 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3381 fEfl |= X86_EFL_AF; \
3382 } \
3383 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3384 fEfl |= X86_EFL_CALC_ZF(uResult); \
3385 fEfl |= g_afParity[uResult & 0xff]; \
3386 *pfEFlags = fEfl; \
3387 } \
3388}
3389
3390#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3391EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3392#endif
3393EMIT_SHRD(64, uint64_t, _intel, 1)
3394EMIT_SHRD(64, uint64_t, _amd, 0)
3395
3396#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3397EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3398#endif
3399EMIT_SHRD(32, uint32_t, _intel, 1)
3400EMIT_SHRD(32, uint32_t, _amd, 0)
3401
/* 16-bit SHRD: the count is masked with 31, so the shift can consume more
   than 16 bits.  A 48-bit combined operand (dst:src:dst on Intel,
   src:src:dst on AMD, low word first) models which register supplies the
   bits beyond 16. */
#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= 31; \
    if (cShift) \
    { \
        uint16_t const uDst = *puDst; \
        uint64_t const uTmp = a_fIntelFlags \
                            ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
                            : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
        uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
        *puDst = uResult; \
        \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        if (a_fIntelFlags) \
        { \
            /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
            fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
        } \
        else \
        { \
            /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
            fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
            /* AMD 3990X: Set according to last shift. AF always set. */ \
            if (cShift > 1) /* Set according to last shift. */ \
                fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
            else \
                fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= X86_EFL_CALC_SF(uResult, 16); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        *pfEFlags = fEfl; \
    } \
}

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD_16(RT_NOTHING, 1)
#endif
EMIT_SHRD_16(_intel, 1)
EMIT_SHRD_16(_amd, 0)
3447
3448
3449/*
3450 * RORX (BMI2)
3451 */
3452#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3453IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3454{ \
3455 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3456}
3457
3458#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3459EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3460#endif
3461#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3462EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3463#endif
3464
3465
3466/*
3467 * SHLX (BMI2)
3468 */
3469#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3470IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3471{ \
3472 cShift &= a_cBitsWidth - 1; \
3473 *puDst = uSrc << cShift; \
3474}
3475
3476#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3477EMIT_SHLX(64, uint64_t, RT_NOTHING)
3478EMIT_SHLX(64, uint64_t, _fallback)
3479#endif
3480#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3481EMIT_SHLX(32, uint32_t, RT_NOTHING)
3482EMIT_SHLX(32, uint32_t, _fallback)
3483#endif
3484
3485
3486/*
3487 * SHRX (BMI2)
3488 */
3489#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3490IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3491{ \
3492 cShift &= a_cBitsWidth - 1; \
3493 *puDst = uSrc >> cShift; \
3494}
3495
3496#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3497EMIT_SHRX(64, uint64_t, RT_NOTHING)
3498EMIT_SHRX(64, uint64_t, _fallback)
3499#endif
3500#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3501EMIT_SHRX(32, uint32_t, RT_NOTHING)
3502EMIT_SHRX(32, uint32_t, _fallback)
3503#endif
3504
3505
3506/*
3507 * SARX (BMI2)
3508 */
3509#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3510IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3511{ \
3512 cShift &= a_cBitsWidth - 1; \
3513 *puDst = (a_iType)uSrc >> cShift; \
3514}
3515
3516#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3517EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3518EMIT_SARX(64, uint64_t, int64_t, _fallback)
3519#endif
3520#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3521EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3522EMIT_SARX(32, uint32_t, int32_t, _fallback)
3523#endif
3524
3525
3526/*
3527 * PDEP (BMI2)
3528 */
3529#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3530IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3531{ \
3532 a_uType uResult = 0; \
3533 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3534 if (fMask & ((a_uType)1 << iMaskBit)) \
3535 { \
3536 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3537 iBit++; \
3538 } \
3539 *puDst = uResult; \
3540}
3541
3542#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3543EMIT_PDEP(64, uint64_t, RT_NOTHING)
3544#endif
3545EMIT_PDEP(64, uint64_t, _fallback)
3546#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3547EMIT_PDEP(32, uint32_t, RT_NOTHING)
3548#endif
3549EMIT_PDEP(32, uint32_t, _fallback)
3550
3551/*
3552 * PEXT (BMI2)
3553 */
3554#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PEXT(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PEXT(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PEXT(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PEXT(32, uint32_t, _fallback)
3575
3576
3577#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3578
3579# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3580/*
3581 * BSWAP
3582 */
3583
3584IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3585{
3586 *puDst = ASMByteSwapU64(*puDst);
3587}
3588
3589
3590IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3591{
3592 *puDst = ASMByteSwapU32(*puDst);
3593}
3594
3595
/* Note! BSWAP with a 16-bit operand size is undocumented, hence the 32-bit argument. */
IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
{
#if 0
    *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
#else
    /* This is the behaviour AMD 3990x (64-bit mode): the low 16 bits are zeroed. */
    *(uint16_t *)puDst = 0;
#endif
}
3606
3607# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3608
3609
3610
3611# if defined(IEM_WITHOUT_ASSEMBLY)
3612
3613/*
3614 * LFENCE, SFENCE & MFENCE.
3615 */
3616
/** LFENCE: implemented via a host read (load) barrier. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
{
    ASMReadFence();
}
3621
3622
/** SFENCE: implemented via a host write (store) barrier. */
IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
{
    ASMWriteFence();
}
3627
3628
/** MFENCE: implemented via a full host memory barrier. */
IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
{
    ASMMemoryFence();
}
3633
3634
# ifndef RT_ARCH_ARM64
/** Alternative memory fence: full host memory barrier (ARM64 has its own implementation elsewhere). */
IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
{
    ASMMemoryFence();
}
# endif
3641
3642# endif
3643
3644#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3645
3646
3647IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3648{
3649 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3650 {
3651 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3652 *pu16Dst |= u16Src & X86_SEL_RPL;
3653
3654 *pfEFlags |= X86_EFL_ZF;
3655 }
3656 else
3657 *pfEFlags &= ~X86_EFL_ZF;
3658}
3659
3660
3661#if defined(IEM_WITHOUT_ASSEMBLY)
3662
3663/*********************************************************************************************************************************
3664* x87 FPU Loads *
3665*********************************************************************************************************************************/
3666
/**
 * FLD: Loads a single precision (r32) value, converting it to the 80-bit
 * format the caller pushes onto the FPU stack.
 *
 * @param   pFpuState   The FPU state; FCW masks control unmasked exception
 *                      behaviour, FSW supplies the C0/C2/C3 bits to keep.
 * @param   pFpuRes     Where to return the 80-bit result and updated FSW.
 * @param   pr32Val     The 32-bit value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (RTFLOAT32U_IS_NORMAL(pr32Val))
    {
        /* Normal value: widen the fraction and re-bias the exponent. */
        pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger = 1;
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
        pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
        Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
    }
    else if (RTFLOAT32U_IS_ZERO(pr32Val))
    {
        /* Signed zero is preserved. */
        pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = 0;
        pFpuRes->r80Result.s.uMantissa = 0;
        Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
    }
    else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
    {
        /* Subnormal values gets normalized (the r80 exponent range can represent them),
           raising #DE; the value is pushed even when the exception is unmasked. */
        pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger = 1;
        unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
        pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
        pFpuRes->FSW |= X86_FSW_DE;
        if (!(pFpuState->FCW & X86_FCW_DM))
            pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
    }
    else if (RTFLOAT32U_IS_INF(pr32Val))
    {
        pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
        Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
    }
    else
    {
        /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
        Assert(RTFLOAT32U_IS_NAN(pr32Val));
        pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.sj64.fInteger = 1;
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
        if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
        {
            pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
            pFpuRes->FSW |= X86_FSW_IE;

            if (!(pFpuState->FCW & X86_FCW_IM))
            {
                /* The value is not pushed. */
                pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
                pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
                pFpuRes->r80Result.au64[0] = 0;
                pFpuRes->r80Result.au16[4] = 0;
            }
        }
        else
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
    }
}
3734
3735
/**
 * FLD: Loads a double precision (r64) value, converting it to the 80-bit
 * format the caller pushes onto the FPU stack.
 *
 * Mirrors iemAImpl_fld_r80_from_r32, only with 64-bit field widths.
 *
 * @param   pFpuState   The FPU state; FCW masks control unmasked exception
 *                      behaviour, FSW supplies the C0/C2/C3 bits to keep.
 * @param   pFpuRes     Where to return the 80-bit result and updated FSW.
 * @param   pr64Val     The 64-bit value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (RTFLOAT64U_IS_NORMAL(pr64Val))
    {
        /* Normal value: widen the fraction and re-bias the exponent. */
        pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger = 1;
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
        pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
        Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
    }
    else if (RTFLOAT64U_IS_ZERO(pr64Val))
    {
        /* Signed zero is preserved. */
        pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = 0;
        pFpuRes->r80Result.s.uMantissa = 0;
        Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
    }
    else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
    {
        /* Subnormal values gets normalized (the r80 exponent range can represent them),
           raising #DE; the value is pushed even when the exception is unmasked. */
        pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger = 1;
        unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
        pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
        pFpuRes->FSW |= X86_FSW_DE;
        if (!(pFpuState->FCW & X86_FCW_DM))
            pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
    }
    else if (RTFLOAT64U_IS_INF(pr64Val))
    {
        pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
        Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
    }
    else
    {
        /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
        Assert(RTFLOAT64U_IS_NAN(pr64Val));
        pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.sj64.fInteger = 1;
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
        if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
        {
            pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
            pFpuRes->FSW |= X86_FSW_IE;

            if (!(pFpuState->FCW & X86_FCW_IM))
            {
                /* The value is not pushed. */
                pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
                pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
                pFpuRes->r80Result.au64[0] = 0;
                pFpuRes->r80Result.au16[4] = 0;
            }
        }
        else
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
    }
}
3801
3802
/** FLD: Loads an 80-bit value verbatim (no conversion, no exceptions). */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    /* Copy the mantissa (au64[0]) and the sign+exponent word (au16[4]) as-is. */
    pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
    pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
    /* Raises no exceptions. */
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
}
3810
3811
/** FLD1: Loads the constant +1.0. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
{
    pFpuRes->r80Result.sj64.fSign = 0;
    pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
    pFpuRes->r80Result.sj64.fInteger = 1;
    pFpuRes->r80Result.sj64.uFraction = 0;

    /*
     * FPU status word:
     * - TOP is irrelevant, but we must match x86 assembly version.
     * - C1 is always cleared as we don't have any stack overflows.
     * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
     */
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
}
3827
3828
/** FLDL2E: Loads the constant log2(e), rounded according to FCW.RC. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
{
    pFpuRes->r80Result.sj64.fSign = 0;
    pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
    pFpuRes->r80Result.sj64.fInteger = 1;
    /* Nearest and round-up pick the larger mantissa; down/chop the smaller one. */
    pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                     || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                                      ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
}
3839
3840
/** FLDL2T: Loads the constant log2(10), rounded according to FCW.RC. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
{
    pFpuRes->r80Result.sj64.fSign = 0;
    pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
    pFpuRes->r80Result.sj64.fInteger = 1;
    /* Only round-up picks the larger mantissa here. */
    pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
                                      ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
}
3850
3851
/** FLDLG2: Loads the constant log10(2), rounded according to FCW.RC. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
{
    pFpuRes->r80Result.sj64.fSign = 0;
    pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
    pFpuRes->r80Result.sj64.fInteger = 1;
    /* Nearest and round-up pick the larger mantissa; down/chop the smaller one. */
    pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                     || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                                      ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
}
3862
3863
/** FLDLN2: Loads the constant ln(2), rounded according to FCW.RC. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
{
    pFpuRes->r80Result.sj64.fSign = 0;
    pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
    pFpuRes->r80Result.sj64.fInteger = 1;
    /* Nearest and round-up pick the larger mantissa; down/chop the smaller one. */
    pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                     || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                                      ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
}
3874
3875
/** FLDPI: Loads the constant pi, rounded according to FCW.RC. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
{
    pFpuRes->r80Result.sj64.fSign = 0;
    pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
    pFpuRes->r80Result.sj64.fInteger = 1;
    /* Nearest and round-up pick the larger mantissa; down/chop the smaller one. */
    pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                     || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                                      ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
}
3886
3887
/** FLDZ: Loads the constant +0.0. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
{
    pFpuRes->r80Result.s.fSign = 0;
    pFpuRes->r80Result.s.uExponent = 0;
    pFpuRes->r80Result.s.uMantissa = 0;
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
}
3895
/**
 * FILD: Loads a signed 16/32/64-bit integer, converting it exactly to the
 * 80-bit format (every integer value fits in the 64-bit mantissa).
 * Zero becomes +0.0; negative inputs are negated with the sign bit set.
 */
#define EMIT_FILD(a_cBits) \
IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
                                                             int ## a_cBits ## _t const *piVal)) \
{ \
    int ## a_cBits ## _t iVal = *piVal; \
    if (iVal == 0) \
    { \
        pFpuRes->r80Result.s.fSign = 0; \
        pFpuRes->r80Result.s.uExponent = 0; \
        pFpuRes->r80Result.s.uMantissa = 0; \
    } \
    else \
    { \
        if (iVal > 0) \
            pFpuRes->r80Result.s.fSign = 0; \
        else \
        { \
            pFpuRes->r80Result.s.fSign = 1; \
            /* NOTE(review): -iVal overflows for the minimum value; the later unsigned cast relies on wrap - confirm intended. */ \
            iVal = -iVal; \
        } \
        /* Normalize: put the most significant set bit into the integer-bit position. */ \
        unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
        pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
        pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
    } \
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
}
EMIT_FILD(16)
EMIT_FILD(32)
EMIT_FILD(64)
3925
3926
/**
 * FBLD: Loads an 18-digit packed BCD value (9 digit pairs + sign), converting
 * it exactly to 80-bit floating point (10^18-1 fits in 60 bits, so lossless).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (   pd80Val->s.abPairs[0] == 0
        && pd80Val->s.abPairs[1] == 0
        && pd80Val->s.abPairs[2] == 0
        && pd80Val->s.abPairs[3] == 0
        && pd80Val->s.abPairs[4] == 0
        && pd80Val->s.abPairs[5] == 0
        && pd80Val->s.abPairs[6] == 0
        && pd80Val->s.abPairs[7] == 0
        && pd80Val->s.abPairs[8] == 0)
    {
        /* All digits zero: load a signed zero. */
        pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = 0;
        pFpuRes->r80Result.s.uMantissa = 0;
    }
    else
    {
        pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;

        /* Skip trailing zero pairs so the conversion loop only visits significant digits. */
        size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
        while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
            cPairs--;

        /* Accumulate the BCD digit pairs into a binary integer (least significant pair first). */
        uint64_t uVal = 0;
        uint64_t uFactor = 1;
        for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
            uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
                  + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;

        /* Normalize: shift the leading one into the integer-bit position. */
        unsigned const cBits = ASMBitLastSetU64(uVal);
        pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
        pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
    }
}
3963
3964
3965/*********************************************************************************************************************************
3966* x87 FPU Stores *
3967*********************************************************************************************************************************/
3968
3969/**
3970 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
3971 *
3972 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
3973 *
3974 * @returns Updated FPU status word value.
3975 * @param fSignIn Incoming sign indicator.
3976 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
3977 * @param iExponentIn Unbiased exponent.
3978 * @param fFcw The FPU control word.
3979 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
3980 * @param pr32Dst Where to return the output value, if one should be
3981 * returned.
3982 *
3983 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
3984 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
3985 */
3986static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
3987 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
3988{
3989 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
3990 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3991 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
3992 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
3993 ? fRoundingOffMask
3994 : 0;
3995 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
3996
3997 /*
3998 * Deal with potential overflows/underflows first, optimizing for none.
3999 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4000 */
4001 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4002 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4003 { /* likely? */ }
4004 /*
4005 * Underflow if the exponent zero or negative. This is attempted mapped
4006 * to a subnormal number when possible, with some additional trickery ofc.
4007 */
4008 else if (iExponentOut <= 0)
4009 {
4010 bool const fIsTiny = iExponentOut < 0
4011 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4012 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4013 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4014 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4015
4016 if (iExponentOut <= 0)
4017 {
4018 uMantissaIn = iExponentOut <= -63
4019 ? uMantissaIn != 0
4020 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4021 fRoundedOff = uMantissaIn & fRoundingOffMask;
4022 if (fRoundedOff && fIsTiny)
4023 fFsw |= X86_FSW_UE;
4024 iExponentOut = 0;
4025 }
4026 }
4027 /*
4028 * Overflow if at or above max exponent value or if we will reach max
4029 * when rounding. Will return +/-zero or +/-max value depending on
4030 * whether we're rounding or not.
4031 */
4032 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4033 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4034 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4035 {
4036 fFsw |= X86_FSW_OE;
4037 if (!(fFcw & X86_FCW_OM))
4038 return fFsw | X86_FSW_ES | X86_FSW_B;
4039 fFsw |= X86_FSW_PE;
4040 if (uRoundingAdd)
4041 fFsw |= X86_FSW_C1;
4042 if (!(fFcw & X86_FCW_PM))
4043 fFsw |= X86_FSW_ES | X86_FSW_B;
4044
4045 pr32Dst->s.fSign = fSignIn;
4046 if (uRoundingAdd)
4047 { /* Zero */
4048 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4049 pr32Dst->s.uFraction = 0;
4050 }
4051 else
4052 { /* Max */
4053 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4054 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4055 }
4056 return fFsw;
4057 }
4058
4059 /*
4060 * Normal or subnormal number.
4061 */
4062 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4063 uint64_t uMantissaOut = uMantissaIn;
4064 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4065 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4066 || fRoundedOff != uRoundingAdd)
4067 {
4068 uMantissaOut = uMantissaIn + uRoundingAdd;
4069 if (uMantissaOut >= uMantissaIn)
4070 { /* likely */ }
4071 else
4072 {
4073 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4074 iExponentOut++;
4075 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4076 fFsw |= X86_FSW_C1;
4077 }
4078 }
4079 else
4080 uMantissaOut = uMantissaIn;
4081
4082 /* Truncate the mantissa and set the return value. */
4083 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4084
4085 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4086 pr32Dst->s.uExponent = iExponentOut;
4087 pr32Dst->s.fSign = fSignIn;
4088
4089 /* Set status flags realted to rounding. */
4090 if (fRoundedOff)
4091 {
4092 fFsw |= X86_FSW_PE;
4093 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4094 fFsw |= X86_FSW_C1;
4095 if (!(fFcw & X86_FCW_PM))
4096 fFsw |= X86_FSW_ES | X86_FSW_B;
4097 }
4098
4099 return fFsw;
4100}
4101
4102
4103/**
4104 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4105 */
4106IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4107 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4108{
4109 uint16_t const fFcw = pFpuState->FCW;
4110 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4111 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4112 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4113 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4114 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4115 {
4116 pr32Dst->s.fSign = pr80Src->s.fSign;
4117 pr32Dst->s.uExponent = 0;
4118 pr32Dst->s.uFraction = 0;
4119 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4120 }
4121 else if (RTFLOAT80U_IS_INF(pr80Src))
4122 {
4123 pr32Dst->s.fSign = pr80Src->s.fSign;
4124 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4125 pr32Dst->s.uFraction = 0;
4126 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4127 }
4128 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4129 {
4130 /* Mapped to +/-QNaN */
4131 pr32Dst->s.fSign = pr80Src->s.fSign;
4132 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4133 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4134 }
4135 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4136 {
4137 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4138 if (fFcw & X86_FCW_IM)
4139 {
4140 pr32Dst->s.fSign = 1;
4141 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4142 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4143 fFsw |= X86_FSW_IE;
4144 }
4145 else
4146 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4147 }
4148 else if (RTFLOAT80U_IS_NAN(pr80Src))
4149 {
4150 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4151 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4152 {
4153 pr32Dst->s.fSign = pr80Src->s.fSign;
4154 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4155 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4156 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4157 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4158 fFsw |= X86_FSW_IE;
4159 }
4160 else
4161 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4162 }
4163 else
4164 {
4165 /* Denormal values causes both an underflow and precision exception. */
4166 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4167 if (fFcw & X86_FCW_UM)
4168 {
4169 pr32Dst->s.fSign = pr80Src->s.fSign;
4170 pr32Dst->s.uExponent = 0;
4171 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4172 {
4173 pr32Dst->s.uFraction = 1;
4174 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4175 if (!(fFcw & X86_FCW_PM))
4176 fFsw |= X86_FSW_ES | X86_FSW_B;
4177 }
4178 else
4179 {
4180 pr32Dst->s.uFraction = 0;
4181 fFsw |= X86_FSW_UE | X86_FSW_PE;
4182 if (!(fFcw & X86_FCW_PM))
4183 fFsw |= X86_FSW_ES | X86_FSW_B;
4184 }
4185 }
4186 else
4187 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4188 }
4189 *pu16FSW = fFsw;
4190}
4191
4192
4193/**
4194 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4195 *
4196 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4197 *
4198 * @returns Updated FPU status word value.
4199 * @param fSignIn Incoming sign indicator.
4200 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4201 * @param iExponentIn Unbiased exponent.
4202 * @param fFcw The FPU control word.
4203 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4204 * @param pr64Dst Where to return the output value, if one should be
4205 * returned.
4206 *
4207 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4208 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4209 */
4210static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4211 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4212{
4213 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4214 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4215 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4216 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4217 ? fRoundingOffMask
4218 : 0;
4219 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4220
4221 /*
4222 * Deal with potential overflows/underflows first, optimizing for none.
4223 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4224 */
4225 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4226 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4227 { /* likely? */ }
4228 /*
4229 * Underflow if the exponent zero or negative. This is attempted mapped
4230 * to a subnormal number when possible, with some additional trickery ofc.
4231 */
4232 else if (iExponentOut <= 0)
4233 {
4234 bool const fIsTiny = iExponentOut < 0
4235 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4236 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4237 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4238 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4239
4240 if (iExponentOut <= 0)
4241 {
4242 uMantissaIn = iExponentOut <= -63
4243 ? uMantissaIn != 0
4244 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4245 fRoundedOff = uMantissaIn & fRoundingOffMask;
4246 if (fRoundedOff && fIsTiny)
4247 fFsw |= X86_FSW_UE;
4248 iExponentOut = 0;
4249 }
4250 }
4251 /*
4252 * Overflow if at or above max exponent value or if we will reach max
4253 * when rounding. Will return +/-zero or +/-max value depending on
4254 * whether we're rounding or not.
4255 */
4256 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4257 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4258 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4259 {
4260 fFsw |= X86_FSW_OE;
4261 if (!(fFcw & X86_FCW_OM))
4262 return fFsw | X86_FSW_ES | X86_FSW_B;
4263 fFsw |= X86_FSW_PE;
4264 if (uRoundingAdd)
4265 fFsw |= X86_FSW_C1;
4266 if (!(fFcw & X86_FCW_PM))
4267 fFsw |= X86_FSW_ES | X86_FSW_B;
4268
4269 pr64Dst->s64.fSign = fSignIn;
4270 if (uRoundingAdd)
4271 { /* Zero */
4272 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4273 pr64Dst->s64.uFraction = 0;
4274 }
4275 else
4276 { /* Max */
4277 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4278 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4279 }
4280 return fFsw;
4281 }
4282
4283 /*
4284 * Normal or subnormal number.
4285 */
4286 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4287 uint64_t uMantissaOut = uMantissaIn;
4288 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4289 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4290 || fRoundedOff != uRoundingAdd)
4291 {
4292 uMantissaOut = uMantissaIn + uRoundingAdd;
4293 if (uMantissaOut >= uMantissaIn)
4294 { /* likely */ }
4295 else
4296 {
4297 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4298 iExponentOut++;
4299 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4300 fFsw |= X86_FSW_C1;
4301 }
4302 }
4303 else
4304 uMantissaOut = uMantissaIn;
4305
4306 /* Truncate the mantissa and set the return value. */
4307 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4308
4309 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4310 pr64Dst->s64.uExponent = iExponentOut;
4311 pr64Dst->s64.fSign = fSignIn;
4312
4313 /* Set status flags realted to rounding. */
4314 if (fRoundedOff)
4315 {
4316 fFsw |= X86_FSW_PE;
4317 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4318 fFsw |= X86_FSW_C1;
4319 if (!(fFcw & X86_FCW_PM))
4320 fFsw |= X86_FSW_ES | X86_FSW_B;
4321 }
4322
4323 return fFsw;
4324}
4325
4326
4327/**
4328 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4329 */
4330IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4331 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4332{
4333 uint16_t const fFcw = pFpuState->FCW;
4334 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4335 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4336 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4337 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4338 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4339 {
4340 pr64Dst->s64.fSign = pr80Src->s.fSign;
4341 pr64Dst->s64.uExponent = 0;
4342 pr64Dst->s64.uFraction = 0;
4343 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4344 }
4345 else if (RTFLOAT80U_IS_INF(pr80Src))
4346 {
4347 pr64Dst->s64.fSign = pr80Src->s.fSign;
4348 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4349 pr64Dst->s64.uFraction = 0;
4350 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4351 }
4352 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4353 {
4354 /* Mapped to +/-QNaN */
4355 pr64Dst->s64.fSign = pr80Src->s.fSign;
4356 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4357 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4358 }
4359 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4360 {
4361 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4362 if (fFcw & X86_FCW_IM)
4363 {
4364 pr64Dst->s64.fSign = 1;
4365 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4366 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4367 fFsw |= X86_FSW_IE;
4368 }
4369 else
4370 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4371 }
4372 else if (RTFLOAT80U_IS_NAN(pr80Src))
4373 {
4374 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4375 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4376 {
4377 pr64Dst->s64.fSign = pr80Src->s.fSign;
4378 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4379 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4380 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4381 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4382 fFsw |= X86_FSW_IE;
4383 }
4384 else
4385 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4386 }
4387 else
4388 {
4389 /* Denormal values causes both an underflow and precision exception. */
4390 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4391 if (fFcw & X86_FCW_UM)
4392 {
4393 pr64Dst->s64.fSign = pr80Src->s.fSign;
4394 pr64Dst->s64.uExponent = 0;
4395 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4396 {
4397 pr64Dst->s64.uFraction = 1;
4398 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4399 if (!(fFcw & X86_FCW_PM))
4400 fFsw |= X86_FSW_ES | X86_FSW_B;
4401 }
4402 else
4403 {
4404 pr64Dst->s64.uFraction = 0;
4405 fFsw |= X86_FSW_UE | X86_FSW_PE;
4406 if (!(fFcw & X86_FCW_PM))
4407 fFsw |= X86_FSW_ES | X86_FSW_B;
4408 }
4409 }
4410 else
4411 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4412 }
4413 *pu16FSW = fFsw;
4414}
4415
4416
4417IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4418 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4419{
4420 /*
4421 * FPU status word:
4422 * - TOP is irrelevant, but we must match x86 assembly version (0).
4423 * - C1 is always cleared as we don't have any stack overflows.
4424 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4425 */
4426 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4427 *pr80Dst = *pr80Src;
4428}
4429
4430
4431/*
4432 *
4433 * Mantissa:
4434 * 63 56 48 40 32 24 16 8 0
4435 * v v v v v v v v v
4436 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4437 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4438 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4439 *
4440 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4441 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4442 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4443 * where we'll drop off all but bit 63.
4444 */
4445#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4446IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4447 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4448{ \
4449 uint16_t const fFcw = pFpuState->FCW; \
4450 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4451 bool const fSignIn = pr80Val->s.fSign; \
4452 \
4453 /* \
4454 * Deal with normal numbers first. \
4455 */ \
4456 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4457 { \
4458 uint64_t uMantissa = pr80Val->s.uMantissa; \
4459 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4460 \
4461 if ((uint32_t)iExponent <= a_cBits - 2) \
4462 { \
4463 unsigned const cShiftOff = 63 - iExponent; \
4464 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4465 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4466 ? RT_BIT_64(cShiftOff - 1) \
4467 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4468 ? fRoundingOffMask \
4469 : 0; \
4470 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4471 \
4472 uMantissa >>= cShiftOff; \
4473 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4474 uMantissa += uRounding; \
4475 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4476 { \
4477 if (fRoundedOff) \
4478 { \
4479 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4480 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4481 else if (uRounding) \
4482 fFsw |= X86_FSW_C1; \
4483 fFsw |= X86_FSW_PE; \
4484 if (!(fFcw & X86_FCW_PM)) \
4485 fFsw |= X86_FSW_ES | X86_FSW_B; \
4486 } \
4487 \
4488 if (!fSignIn) \
4489 *piDst = (a_iType)uMantissa; \
4490 else \
4491 *piDst = -(a_iType)uMantissa; \
4492 } \
4493 else \
4494 { \
4495 /* overflowed after rounding. */ \
4496 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4497 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4498 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4499 \
4500 /* Special case for the integer minimum value. */ \
4501 if (fSignIn) \
4502 { \
4503 *piDst = a_iTypeMin; \
4504 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4505 if (!(fFcw & X86_FCW_PM)) \
4506 fFsw |= X86_FSW_ES | X86_FSW_B; \
4507 } \
4508 else \
4509 { \
4510 fFsw |= X86_FSW_IE; \
4511 if (fFcw & X86_FCW_IM) \
4512 *piDst = a_iTypeMin; \
4513 else \
4514 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4515 } \
4516 } \
4517 } \
4518 /* \
4519 * Tiny sub-zero numbers. \
4520 */ \
4521 else if (iExponent < 0) \
4522 { \
4523 if (!fSignIn) \
4524 { \
4525 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4526 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4527 { \
4528 *piDst = 1; \
4529 fFsw |= X86_FSW_C1; \
4530 } \
4531 else \
4532 *piDst = 0; \
4533 } \
4534 else \
4535 { \
4536 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4537 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4538 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4539 *piDst = 0; \
4540 else \
4541 { \
4542 *piDst = -1; \
4543 fFsw |= X86_FSW_C1; \
4544 } \
4545 } \
4546 fFsw |= X86_FSW_PE; \
4547 if (!(fFcw & X86_FCW_PM)) \
4548 fFsw |= X86_FSW_ES | X86_FSW_B; \
4549 } \
4550 /* \
4551 * Special MIN case. \
4552 */ \
4553 else if ( fSignIn && iExponent == a_cBits - 1 \
4554 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4555 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4556 : uMantissa == RT_BIT_64(63))) \
4557 { \
4558 *piDst = a_iTypeMin; \
4559 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4560 { \
4561 fFsw |= X86_FSW_PE; \
4562 if (!(fFcw & X86_FCW_PM)) \
4563 fFsw |= X86_FSW_ES | X86_FSW_B; \
4564 } \
4565 } \
4566 /* \
4567 * Too large/small number outside the target integer range. \
4568 */ \
4569 else \
4570 { \
4571 fFsw |= X86_FSW_IE; \
4572 if (fFcw & X86_FCW_IM) \
4573 *piDst = a_iTypeIndefinite; \
4574 else \
4575 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4576 } \
4577 } \
4578 /* \
4579 * Map both +0 and -0 to integer zero (signless/+). \
4580 */ \
4581 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4582 *piDst = 0; \
4583 /* \
4584 * Denormals are just really tiny sub-zero numbers that are either rounded \
4585 * to zero, 1 or -1 depending on sign and rounding control. \
4586 */ \
4587 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4588 { \
4589 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4590 *piDst = 0; \
4591 else \
4592 { \
4593 *piDst = fSignIn ? -1 : 1; \
4594 fFsw |= X86_FSW_C1; \
4595 } \
4596 fFsw |= X86_FSW_PE; \
4597 if (!(fFcw & X86_FCW_PM)) \
4598 fFsw |= X86_FSW_ES | X86_FSW_B; \
4599 } \
4600 /* \
4601 * All other special values are considered invalid arguments and result \
4602 * in an IE exception and indefinite value if masked. \
4603 */ \
4604 else \
4605 { \
4606 fFsw |= X86_FSW_IE; \
4607 if (fFcw & X86_FCW_IM) \
4608 *piDst = a_iTypeIndefinite; \
4609 else \
4610 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4611 } \
4612 *pu16FSW = fFsw; \
4613}
4614EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4615EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4616EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4617
4618#endif /*IEM_WITHOUT_ASSEMBLY */
4619
4620
4621/*
4622 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4623 *
4624 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4625 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4626 * thus the @a a_cBitsIn.
4627 */
4628#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4629IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4630 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4631{ \
4632 uint16_t const fFcw = pFpuState->FCW; \
4633 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4634 bool const fSignIn = pr80Val->s.fSign; \
4635 \
4636 /* \
4637 * Deal with normal numbers first. \
4638 */ \
4639 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4640 { \
4641 uint64_t uMantissa = pr80Val->s.uMantissa; \
4642 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4643 \
4644 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4645 { \
4646 unsigned const cShiftOff = 63 - iExponent; \
4647 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4648 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4649 uMantissa >>= cShiftOff; \
4650 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4651 if (!fSignIn) \
4652 *piDst = (a_iType)uMantissa; \
4653 else \
4654 *piDst = -(a_iType)uMantissa; \
4655 \
4656 if (fRoundedOff) \
4657 { \
4658 fFsw |= X86_FSW_PE; \
4659 if (!(fFcw & X86_FCW_PM)) \
4660 fFsw |= X86_FSW_ES | X86_FSW_B; \
4661 } \
4662 } \
4663 /* \
4664 * Tiny sub-zero numbers. \
4665 */ \
4666 else if (iExponent < 0) \
4667 { \
4668 *piDst = 0; \
4669 fFsw |= X86_FSW_PE; \
4670 if (!(fFcw & X86_FCW_PM)) \
4671 fFsw |= X86_FSW_ES | X86_FSW_B; \
4672 } \
4673 /* \
4674 * Special MIN case. \
4675 */ \
4676 else if ( fSignIn && iExponent == a_cBits - 1 \
4677 && (a_cBits < 64 \
4678 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4679 : uMantissa == RT_BIT_64(63)) ) \
4680 { \
4681 *piDst = a_iTypeMin; \
4682 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4683 { \
4684 fFsw |= X86_FSW_PE; \
4685 if (!(fFcw & X86_FCW_PM)) \
4686 fFsw |= X86_FSW_ES | X86_FSW_B; \
4687 } \
4688 } \
4689 /* \
4690 * Figure this weirdness. \
4691 */ \
4692 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4693 { \
4694 *piDst = 0; \
4695 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4696 { \
4697 fFsw |= X86_FSW_PE; \
4698 if (!(fFcw & X86_FCW_PM)) \
4699 fFsw |= X86_FSW_ES | X86_FSW_B; \
4700 } \
4701 } \
4702 /* \
4703 * Too large/small number outside the target integer range. \
4704 */ \
4705 else \
4706 { \
4707 fFsw |= X86_FSW_IE; \
4708 if (fFcw & X86_FCW_IM) \
4709 *piDst = a_iTypeIndefinite; \
4710 else \
4711 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4712 } \
4713 } \
4714 /* \
4715 * Map both +0 and -0 to integer zero (signless/+). \
4716 */ \
4717 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4718 *piDst = 0; \
4719 /* \
4720 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4721 */ \
4722 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4723 { \
4724 *piDst = 0; \
4725 fFsw |= X86_FSW_PE; \
4726 if (!(fFcw & X86_FCW_PM)) \
4727 fFsw |= X86_FSW_ES | X86_FSW_B; \
4728 } \
4729 /* \
4730 * All other special values are considered invalid arguments and result \
4731 * in an IE exception and indefinite value if masked. \
4732 */ \
4733 else \
4734 { \
4735 fFsw |= X86_FSW_IE; \
4736 if (fFcw & X86_FCW_IM) \
4737 *piDst = a_iTypeIndefinite; \
4738 else \
4739 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4740 } \
4741 *pu16FSW = fFsw; \
4742}
4743#if defined(IEM_WITHOUT_ASSEMBLY)
4744EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4745EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4746EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4747#endif
4748EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4749EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4750
4751
4752#if defined(IEM_WITHOUT_ASSEMBLY)
4753
/**
 * FBSTP: store an 80-bit floating point value as an 80-bit packed BCD
 * (18 decimal digits + sign), rounding per FCW.RC and raising IE/PE as
 * appropriate.
 *
 * @param   pFpuState   The FPU state; FCW supplies rounding control and
 *                      exception masks.
 * @param   pu16FSW     Where to return the resulting FSW.
 * @param   pd80Dst     Where to store the packed BCD result.
 * @param   pr80Src     The 80-bit source value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
{
    /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
    static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
    static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
                                            RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
    static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();

    uint16_t const fFcw = pFpuState->FCW;
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    bool const fSignIn = pr80Src->s.fSign;

    /*
     * Deal with normal numbers first.
     */
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
    {
        uint64_t uMantissa = pr80Src->s.uMantissa;
        int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
        /* Candidate range: values that may fit 18 decimal digits (0xde0b6b3a763fffff relates
           to 10**18 - 1; exact fit is re-checked against RTPBCD80U_MAX after rounding). */
        if (   (uint32_t)iExponent <= 58
            || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
        {
            unsigned const cShiftOff = 63 - iExponent;
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
            /* Rounding bias: half-ULP for nearest, full mask when rounding away from zero, else zero (truncate). */
            uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                        ? RT_BIT_64(cShiftOff - 1)
                                        : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                        ? fRoundingOffMask
                                        : 0;
            uint64_t fRoundedOff = uMantissa & fRoundingOffMask;

            uMantissa >>= cShiftOff;
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
            uMantissa += uRounding;
            if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
            {
                if (fRoundedOff)
                {
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
                    else if (uRounding)
                        fFsw |= X86_FSW_C1; /* C1 signals rounding up. */
                    fFsw |= X86_FSW_PE;
                    if (!(fFcw & X86_FCW_PM))
                        fFsw |= X86_FSW_ES | X86_FSW_B;
                }

                /* Convert the integer to packed BCD, two decimal digits per byte. */
                pd80Dst->s.fSign = fSignIn;
                pd80Dst->s.uPad = 0;
                for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
                {
                    unsigned const uDigits = uMantissa % 100;
                    uMantissa /= 100;
                    uint8_t const bLo = uDigits % 10;
                    uint8_t const bHi = uDigits / 10;
                    pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
                }
            }
            else
            {
                /* overflowed after rounding. */
                fFsw |= X86_FSW_IE;
                if (fFcw & X86_FCW_IM)
                    *pd80Dst = s_d80Indefinite;
                else
                    fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
            }
        }
        /*
         * Tiny sub-zero numbers.
         */
        else if (iExponent < 0)
        {
            if (!fSignIn)
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
                else
                    *pd80Dst = s_ad80Zeros[fSignIn];
            }
            else
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                    *pd80Dst = s_ad80Zeros[fSignIn];
                else
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
            }
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        /*
         * Too large/small number outside the target integer range.
         */
        else
        {
            fFsw |= X86_FSW_IE;
            if (fFcw & X86_FCW_IM)
                *pd80Dst = s_d80Indefinite;
            else
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    /*
     * Map both +0 and -0 to integer zero (signless/+).
     */
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
        *pd80Dst = s_ad80Zeros[fSignIn];
    /*
     * Denormals are just really tiny sub-zero numbers that are either rounded
     * to zero, 1 or -1 depending on sign and rounding control.
     */
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
    {
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
            *pd80Dst = s_ad80Zeros[fSignIn];
        else
        {
            *pd80Dst = s_ad80One[fSignIn];
            fFsw |= X86_FSW_C1;
        }
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }
    /*
     * All other special values are considered invalid arguments and result
     * in an IE exception and indefinite value if masked.
     */
    else
    {
        fFsw |= X86_FSW_IE;
        if (fFcw & X86_FCW_IM)
            *pd80Dst = s_d80Indefinite;
        else
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
    }
    *pu16FSW = fFsw;
}
4903
4904
4905/*********************************************************************************************************************************
4906* FPU Helpers *
4907*********************************************************************************************************************************/
/* Paranoia: the IPRT floating point types must match the hardware formats byte for byte. */
AssertCompileSize(RTFLOAT128U, 16);
AssertCompileSize(RTFLOAT80U, 10);
AssertCompileSize(RTFLOAT64U, 8);
AssertCompileSize(RTFLOAT32U, 4);
4912
4913/**
4914 * Normalizes a possible pseudo-normal value.
4915 *
4916 * Psuedo-normal values are some oddities from the 8087 & 287 days. They are
4917 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
4918 * i.e. changing uExponent from 0 to 1.
4919 *
4920 * This macro will declare a RTFLOAT80U with the name given by
4921 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
4922 * a normalization was performed.
4923 *
4924 * @note This must be applied before calling SoftFloat with a value that couldbe
4925 * a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
4926 * correctly.
4927 */
4928#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
4929 RTFLOAT80U a_r80ValNormalized; \
4930 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
4931 { \
4932 a_r80ValNormalized = *a_pr80Val; \
4933 a_r80ValNormalized.s.uExponent = 1; \
4934 a_pr80Val = &a_r80ValNormalized; \
4935 } else do {} while (0)
4936
4937#ifdef IEM_WITH_FLOAT128_FOR_FPU
4938
4939DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4940{
4941 int fNew;
4942 switch (fFcw & X86_FCW_RC_MASK)
4943 {
4944 default:
4945 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4946 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4947 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4948 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4949 }
4950 int fOld = fegetround();
4951 fesetround(fNew);
4952 return fOld;
4953}
4954
4955
/** Restores the host FPU rounding mode previously saved by iemFpuF128SetRounding. */
DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
{
    fesetround(fOld);
}
4960
4961DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4962{
4963 RT_NOREF(fFcw);
4964 RTFLOAT128U Tmp;
4965 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4966 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4967 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4968 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4969 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4970 {
4971 Assert(Tmp.s.uExponent == 0);
4972 Tmp.s2.uSignAndExponent++;
4973 }
4974 return *(_Float128 *)&Tmp;
4975}
4976
4977
4978DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
4979{
4980 RT_NOREF(fFcw);
4981 RTFLOAT128U Tmp;
4982 *(_Float128 *)&Tmp = rd128ValSrc;
4983 ASMCompilerBarrier();
4984 if (RTFLOAT128U_IS_NORMAL(&Tmp))
4985 {
4986 pr80Dst->s.fSign = Tmp.s64.fSign;
4987 pr80Dst->s.uExponent = Tmp.s64.uExponent;
4988 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
4989 | Tmp.s64.uFractionLo >> (64 - 15);
4990
4991 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4992 unsigned const cShiftOff = 64 - 15;
4993 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4994 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
4995 if (uRoundedOff)
4996 {
4997 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4998 ? RT_BIT_64(cShiftOff - 1)
4999 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5000 ? fRoundingOffMask
5001 : 0;
5002 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5003 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5004 || uRoundedOff != uRoundingAdd)
5005 {
5006 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5007 {
5008 uFraction += 1;
5009 if (!(uFraction & RT_BIT_64(63)))
5010 { /* likely */ }
5011 else
5012 {
5013 uFraction >>= 1;
5014 pr80Dst->s.uExponent++;
5015 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5016 return fFsw;
5017 }
5018 fFsw |= X86_FSW_C1;
5019 }
5020 }
5021 fFsw |= X86_FSW_PE;
5022 if (!(fFcw & X86_FCW_PM))
5023 fFsw |= X86_FSW_ES | X86_FSW_B;
5024 }
5025 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5026 }
5027 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5028 {
5029 pr80Dst->s.fSign = Tmp.s64.fSign;
5030 pr80Dst->s.uExponent = 0;
5031 pr80Dst->s.uMantissa = 0;
5032 }
5033 else if (RTFLOAT128U_IS_INF(&Tmp))
5034 {
5035 pr80Dst->s.fSign = Tmp.s64.fSign;
5036 pr80Dst->s.uExponent = 0;
5037 pr80Dst->s.uMantissa = 0;
5038 }
5039 return fFsw;
5040}
5041
5042
5043#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5044
/** Initializer for the SoftFloat state structure.
 * Field order assumed to match softfloat_state_t: detectTininess, roundingMode,
 * exceptionFlags, exceptionMask, roundingPrecision -- verify against softfloat.h. */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
        /* FCW.RC -> SoftFloat rounding mode: */ \
        ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN ? (uint8_t)softfloat_round_min \
        : (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
        /* FCW.PC -> extF80 rounding precision (24/53/64 bit mantissa -> 32/64/80): */ \
        ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
    }

/** Returns updated FSW from a SoftFloat state and exception mask (FCW).
 * The SoftFloat exception flags use the X86_FSW_XCPT_MASK bit positions, with
 * C1 (rounded up) carried in softfloat_flag_c1 and shifted into X86_FSW_C1;
 * ES+B are set when any raised exception is unmasked in the FCW. */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    ( (a_fFsw) \
     | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
     | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
     | ( ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
        ? X86_FSW_ES | X86_FSW_B : 0) )
5066
5067
/**
 * Reduces the effective precision of a SoftFloat 128-bit value to @a cBits
 * mantissa bits by truncating (clearing) the low fraction bits.
 *
 * Used to mimic the 80-bit FPU's 64-bit mantissa when doing intermediate
 * calculations in 128-bit precision.  The @a fFcw parameter is currently
 * unused; the disabled code shows a rounding experiment that did not improve
 * accuracy.
 */
DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
{
    RT_NOREF(fFcw);
    Assert(cBits > 64); /* only the low qword is masked, so must keep more than 64 bits */
# if 0 /* rounding does not seem to help */
    uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
    r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
    if (off >= RT_BIT_64(1 + 112 - cBits - 1)
        && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
    {
        uint64_t uOld = r128.v[0];
        r128.v[0] += RT_BIT_64(1 + 112 - cBits);
        if (r128.v[0] < uOld)
            r128.v[1] += 1;
    }
# else
    r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
# endif
    return r128;
}
5088
5089
/**
 * Same as iemFpuSoftF128Precision, but taking an IPRT 128-bit floating point
 * constant as input and returning the SoftFloat representation.
 */
DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
{
    RT_NOREF(fFcw);
    Assert(cBits > 64); /* only the low qword is masked, so must keep more than 64 bits */
# if 0 /* rounding does not seem to help, not even on constants */
    float128_t r128 = { pr128->au64[0], pr128->au64[1] };
    uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
    r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
    if (off >= RT_BIT_64(1 + 112 - cBits - 1)
        && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
    {
        uint64_t uOld = r128.v[0];
        r128.v[0] += RT_BIT_64(1 + 112 - cBits);
        if (r128.v[0] < uOld)
            r128.v[1] += 1;
    }
    return r128;
# else
    float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
    return r128;
# endif
}
5112
5113
5114# if 0 /* unused */
5115DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
5116{
5117 float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
5118 return r128;
5119}
5120# endif
5121
5122
5123/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5124DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5125{
5126 extFloat80_t Tmp;
5127 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5128 Tmp.signif = pr80Val->s2.uMantissa;
5129 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5130 return extF80_to_f128(Tmp, &Ignored);
5131}
5132
5133
5134/**
5135 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5136 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5137 *
5138 * This is only a structure format conversion, nothing else.
5139 */
5140DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5141{
5142 extFloat80_t Tmp;
5143 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5144 Tmp.signif = pr80Val->s2.uMantissa;
5145 return Tmp;
5146}
5147
5148
5149/**
5150 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5151 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5152 *
5153 * This is only a structure format conversion, nothing else.
5154 */
5155DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5156{
5157 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5158 pr80Dst->s2.uMantissa = r80XSrc.signif;
5159 return pr80Dst;
5160}
5161
5162
5163DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5164{
5165 RT_NOREF(fFcw);
5166 RTFLOAT128U Tmp;
5167 *(float128_t *)&Tmp = r128Src;
5168 ASMCompilerBarrier();
5169
5170 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5171 {
5172 pr80Dst->s.fSign = Tmp.s64.fSign;
5173 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5174 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5175 | Tmp.s64.uFractionLo >> (64 - 15);
5176
5177 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5178 unsigned const cShiftOff = 64 - 15;
5179 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5180 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5181 if (uRoundedOff)
5182 {
5183 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5184 ? RT_BIT_64(cShiftOff - 1)
5185 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5186 ? fRoundingOffMask
5187 : 0;
5188 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5189 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5190 || uRoundedOff != uRoundingAdd)
5191 {
5192 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5193 {
5194 uFraction += 1;
5195 if (!(uFraction & RT_BIT_64(63)))
5196 { /* likely */ }
5197 else
5198 {
5199 uFraction >>= 1;
5200 pr80Dst->s.uExponent++;
5201 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5202 return fFsw;
5203 }
5204 fFsw |= X86_FSW_C1;
5205 }
5206 }
5207 fFsw |= X86_FSW_PE;
5208 if (!(fFcw & X86_FCW_PM))
5209 fFsw |= X86_FSW_ES | X86_FSW_B;
5210 }
5211
5212 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5213 }
5214 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5215 {
5216 pr80Dst->s.fSign = Tmp.s64.fSign;
5217 pr80Dst->s.uExponent = 0;
5218 pr80Dst->s.uMantissa = 0;
5219 }
5220 else if (RTFLOAT128U_IS_INF(&Tmp))
5221 {
5222 pr80Dst->s.fSign = Tmp.s64.fSign;
5223 pr80Dst->s.uExponent = 0;
5224 pr80Dst->s.uMantissa = 0;
5225 }
5226 return fFsw;
5227}
5228
5229
5230/**
5231 * Helper for transfering exception and C1 to FSW and setting the result value
5232 * accordingly.
5233 *
5234 * @returns Updated FSW.
5235 * @param pSoftState The SoftFloat state following the operation.
5236 * @param r80XResult The result of the SoftFloat operation.
5237 * @param pr80Result Where to store the result for IEM.
5238 * @param fFcw The FPU control word.
5239 * @param fFsw The FSW before the operation, with necessary bits
5240 * cleared and such.
5241 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5242 * raised.
5243 */
5244DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5245 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5246 PCRTFLOAT80U pr80XcptResult)
5247{
5248 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5249 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5250 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5251 fFsw |= X86_FSW_ES | X86_FSW_B;
5252
5253 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5254 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5255 else
5256 {
5257 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5258 *pr80Result = *pr80XcptResult;
5259 }
5260 return fFsw;
5261}
5262
5263
5264/**
5265 * Helper doing polynomial evaluation using Horner's method.
5266 *
5267 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5268 */
5269float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5270 unsigned cPrecision, softfloat_state_t *pSoftState)
5271{
5272 Assert(cHornerConsts > 1);
5273 size_t i = cHornerConsts - 1;
5274 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5275 while (i-- > 0)
5276 {
5277 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5278 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5279 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5280 }
5281 return r128Result;
5282}
5283
5284#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5285
5286
/**
 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
 * mantissa, exponent and sign.
 *
 * @returns Updated FSW.
 * @param   pr80Dst         Where to return the composed value.
 * @param   fSign           The sign.
 * @param   puMantissa      The mantissa, 256-bit type but the top 64 bits are
 *                          ignored and should be zero. This will probably be
 *                          modified during normalization and rounding.
 * @param   iExponent       Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            The FPU status word.
 */
static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
                                                    int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
{
    /* Only the lower 192 bits may be non-zero (enforced in strict builds). */
    AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);

    iExponent += RTFLOAT80U_EXP_BIAS;

    /* Do normalization if necessary and possible.  The goal is bit 63 of qw2
       becoming the integer bit of the 80-bit value. */
    if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
    {
        int cShift = 192 - RTUInt256BitCount(puMantissa);
        if (iExponent > cShift)
            iExponent -= cShift;
        else
        {
            /* Full normalization would underflow the exponent.  With #UE
               masked, only shift as far as the exponent allows (denormal);
               otherwise keep cShift and let iExponent go non-positive so the
               underflow handling below applies the bias adjustment. */
            if (fFcw & X86_FCW_UM)
            {
                if (iExponent > 0)
                    cShift = --iExponent;
                else
                    cShift = 0;
            }
            iExponent -= cShift;
        }
        RTUInt256AssignShiftLeft(puMantissa, cShift);
    }

    /* Do rounding.  qw1/qw0 below the kept 64 bits act as round/sticky bits. */
    uint64_t uMantissa = puMantissa->QWords.qw2;
    if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
    {
        bool fAdd;
        switch (fFcw & X86_FCW_RC_MASK)
        {
            default: /* (for the simple-minded MSC which otherwise thinks fAdd would be used uninitialized) */
            case X86_FCW_RC_NEAREST:
                /* Round up when the discarded part is above half, or exactly
                   half and the mantissa is odd (ties-to-even). */
                if (puMantissa->QWords.qw1 & RT_BIT_64(63))
                {
                    if (   (uMantissa & 1)
                        || puMantissa->QWords.qw0 != 0
                        || puMantissa->QWords.qw1 != RT_BIT_64(63))
                    {
                        fAdd = true;
                        break;
                    }
                    uMantissa &= ~(uint64_t)1;
                }
                fAdd = false;
                break;
            case X86_FCW_RC_ZERO:
                fAdd = false;
                break;
            case X86_FCW_RC_UP:
                fAdd = !fSign;
                break;
            case X86_FCW_RC_DOWN:
                fAdd = fSign;
                break;
        }
        if (fAdd)
        {
            uint64_t const uTmp = uMantissa;
            uMantissa = uTmp + 1;
            if (uMantissa < uTmp)
            {
                /* Increment wrapped the mantissa: shift back in the integer
                   bit and bump the exponent. */
                uMantissa >>= 1;
                uMantissa |= RT_BIT_64(63);
                iExponent++;
            }
            /* C1 signals that the result was rounded up. */
            fFsw |= X86_FSW_C1;
        }
        /* Discarded bits always mean an inexact result (#PE). */
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    /* Check for underflow (denormals). */
    if (iExponent <= 0)
    {
        if (fFcw & X86_FCW_UM)
        {
            /* Masked #UE: produce a denormal (exponent 0, no integer bit). */
            if (uMantissa & RT_BIT_64(63))
                uMantissa >>= 1;
            iExponent = 0;
        }
        else
        {
            /* Unmasked #UE: rebias the exponent for the exception handler. */
            iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_UE;
    }
    /* Check for overflow */
    else if (iExponent >= RTFLOAT80U_EXP_MAX)
    {
        /* NOTE(review): this assertion is always false inside the branch, i.e.
           overflow is not actually handled here (no #OE, no clamp to
           infinity/max).  Presumably the callers never produce exponents this
           large — confirm before relying on this path. */
        Assert(iExponent < RTFLOAT80U_EXP_MAX);
    }

    /* Compose the result. */
    pr80Dst->s.uMantissa = uMantissa;
    pr80Dst->s.uExponent = iExponent;
    pr80Dst->s.fSign     = fSign;
    return fFsw;
}
5405
5406
5407/**
5408 * See also iemAImpl_fld_r80_from_r32
5409 */
5410static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5411{
5412 uint16_t fFsw = 0;
5413 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5414 {
5415 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5416 pr80Dst->sj64.fInteger = 1;
5417 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5418 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5419 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5420 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5421 }
5422 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5423 {
5424 pr80Dst->s.fSign = pr32Val->s.fSign;
5425 pr80Dst->s.uExponent = 0;
5426 pr80Dst->s.uMantissa = 0;
5427 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5428 }
5429 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5430 {
5431 /* Subnormal -> normalized + X86_FSW_DE return. */
5432 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5433 pr80Dst->sj64.fInteger = 1;
5434 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5435 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5436 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5437 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5438 fFsw = X86_FSW_DE;
5439 }
5440 else if (RTFLOAT32U_IS_INF(pr32Val))
5441 {
5442 pr80Dst->s.fSign = pr32Val->s.fSign;
5443 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5444 pr80Dst->s.uMantissa = RT_BIT_64(63);
5445 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5446 }
5447 else
5448 {
5449 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5450 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5451 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5452 pr80Dst->sj64.fInteger = 1;
5453 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5454 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5455 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5456 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5457 }
5458 return fFsw;
5459}
5460
5461
5462/**
5463 * See also iemAImpl_fld_r80_from_r64
5464 */
5465static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5466{
5467 uint16_t fFsw = 0;
5468 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5469 {
5470 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5471 pr80Dst->sj64.fInteger = 1;
5472 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5473 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5474 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5475 }
5476 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5477 {
5478 pr80Dst->s.fSign = pr64Val->s.fSign;
5479 pr80Dst->s.uExponent = 0;
5480 pr80Dst->s.uMantissa = 0;
5481 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5482 }
5483 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5484 {
5485 /* Subnormal values gets normalized. */
5486 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5487 pr80Dst->sj64.fInteger = 1;
5488 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5489 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5490 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5491 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5492 fFsw = X86_FSW_DE;
5493 }
5494 else if (RTFLOAT64U_IS_INF(pr64Val))
5495 {
5496 pr80Dst->s.fSign = pr64Val->s.fSign;
5497 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5498 pr80Dst->s.uMantissa = RT_BIT_64(63);
5499 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5500 }
5501 else
5502 {
5503 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5504 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5505 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5506 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5507 pr80Dst->sj64.fInteger = 1;
5508 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5509 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5510 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5511 }
5512 return fFsw;
5513}
5514
5515
5516/**
5517 * See also EMIT_FILD.
5518 */
5519#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5520static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5521{ \
5522 if (iVal == 0) \
5523 { \
5524 pr80Dst->s.fSign = 0; \
5525 pr80Dst->s.uExponent = 0; \
5526 pr80Dst->s.uMantissa = 0; \
5527 } \
5528 else \
5529 { \
5530 if (iVal > 0) \
5531 pr80Dst->s.fSign = 0; \
5532 else \
5533 { \
5534 pr80Dst->s.fSign = 1; \
5535 iVal = -iVal; \
5536 } \
5537 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5538 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5539 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5540 } \
5541 return pr80Dst; \
5542}
5543EMIT_CONVERT_IXX_TO_R80(16)
5544EMIT_CONVERT_IXX_TO_R80(32)
5545//EMIT_CONVERT_IXX_TO_R80(64)
5546
/** For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * Widens the 64-bit operand to 80 bits and defers to the r80-by-r80 worker
 * (a_fnR80ByR80), forcing TOP to 7 in the resulting FSW.  A subnormal r64
 * operand flags \#DE; the flag is dropped when the first operand is a 387
 * invalid encoding, a NaN, or satisfies a_DenormalException (those cases take
 * priority in the worker).  With \#DE unmasked the operation is aborted and
 * the unmodified first operand returned with DE+ES+B set. */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw) \
    { \
        /* The conversion flagged a denormal second operand. */ \
        if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
            fFsw = 0; /* higher priority condition; let the worker report it */ \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked #DE: bail out without performing the operation. */ \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                         | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5569
/** For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * Same scheme as EMIT_R80_BY_R64, just with a 32-bit floating point second
 * operand: widen it to 80 bits, handle a possible \#DE from the conversion,
 * then defer to the r80-by-r80 worker and force TOP to 7. */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw) \
    { \
        /* The conversion flagged a denormal second operand. */ \
        if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
            fFsw = 0; /* higher priority condition; let the worker report it */ \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked #DE: bail out without performing the operation. */ \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                         | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5592
/** For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * A 32-bit signed integer always converts exactly (no denormals, no status),
 * so this just converts, calls the r80-by-r80 worker and forces TOP to 7. */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2)); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5601
/** For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * A 16-bit signed integer always converts exactly (no denormals, no status),
 * so this just converts, calls the r80-by-r80 worker and forces TOP to 7. */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2)); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5610
5611
5612
5613/*********************************************************************************************************************************
5614* x86 FPU Division Operations *
5615*********************************************************************************************************************************/
5616
5617/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5618static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5619 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5620{
5621 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5622 {
5623 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5624 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5625 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5626 }
5627 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5628 { /* Div by zero. */
5629 if (fFcw & X86_FCW_ZM)
5630 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5631 else
5632 {
5633 *pr80Result = *pr80Val1Org;
5634 fFsw |= X86_FSW_ES | X86_FSW_B;
5635 }
5636 fFsw |= X86_FSW_ZE;
5637 }
5638 else
5639 { /* Invalid operand */
5640 if (fFcw & X86_FCW_IM)
5641 *pr80Result = g_r80Indefinite;
5642 else
5643 {
5644 *pr80Result = *pr80Val1Org;
5645 fFsw |= X86_FSW_ES | X86_FSW_B;
5646 }
5647 fFsw |= X86_FSW_IE;
5648 }
5649 return fFsw;
5650}
5651
5652
/**
 * FDIV, 80-bit by 80-bit: pFpuRes->r80Result = *pr80Val1 / *pr80Val2.
 *
 * Screens the operands for 387-invalid encodings and (pseudo-)denormals
 * before handing the division to SoftFloat, which knows nothing about these
 * x86 specifics.  C0/C2/C3 are carried over from the input FSW and TOP is
 * forced to 6.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
    if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
    {
        if (fFcw & X86_FCW_IM)
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
    else if (   (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
             || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
    {
        if (fFcw & X86_FCW_DM)
        {
            /* Masked #DE: normalize pseudo-denormals and do the division anyway. */
            PCRTFLOAT80U const pr80Val1Org = pr80Val1;
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
            fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
        }
        else
        {
            /* Unmasked #DE: keep the first operand and signal. */
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_DE;
    }
    /* SoftFloat can handle the rest: */
    else
        fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);

    pFpuRes->FSW = fFsw;
}
5695
5696
/* FDIV/FIDIV variants taking a 64-bit float, 32-bit float, or 32/16-bit integer second operand. */
EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5701
5702
/**
 * FDIVR, 80-bit by 80-bit: pFpuRes->r80Result = *pr80Val2 / *pr80Val1.
 *
 * Same screening as iemAImpl_fdiv_r80_by_r80, but the operands are swapped
 * when calling the division worker (note the mirrored zero checks in the
 * denormal condition below).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
    if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
    {
        if (fFcw & X86_FCW_IM)
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
    else if (   (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
             || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
    {
        if (fFcw & X86_FCW_DM)
        {
            /* Masked #DE: normalize pseudo-denormals and do the (reversed) division anyway. */
            PCRTFLOAT80U const pr80Val1Org = pr80Val1;
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
            fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
        }
        else
        {
            /* Unmasked #DE: keep the first operand and signal. */
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_DE;
    }
    /* SoftFloat can handle the rest: */
    else
        fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);

    pFpuRes->FSW = fFsw;
}
5745
5746
/* FDIVR/FIDIVR variants.  A zero first operand makes the divide-by-zero case
   take priority over the denormal second operand (a_DenormalException). */
EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5751
5752
/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80.
 *
 * fLegacyInstr selects the rounding used by the partial remainder: truncation
 * (softfloat_round_minMag) for FPREM, round-to-nearest for FPREM1. */
static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
                                                        uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
{
    if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
    {
        softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
        uint16_t fCxFlags = 0;
        extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
                                                    fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
                                                    &fCxFlags, &SoftState);
        Assert(!(fCxFlags & ~X86_FSW_C_MASK));
        fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
        /* Transfer the C0/C1/C2/C3 flags from partialRem, but only for an
           ordinary (non-#IE, non-NaN/indefinite) result. */
        if (   !(fFsw & X86_FSW_IE)
            && !RTFLOAT80U_IS_NAN(pr80Result)
            && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
        {
            fFsw &= ~(uint16_t)X86_FSW_C_MASK;
            fFsw |= fCxFlags & X86_FSW_C_MASK;
        }
        return fFsw;
    }

    /* Invalid operand (zero divisor with a finite dividend). */
    if (fFcw & X86_FCW_IM)
        *pr80Result = g_r80Indefinite;
    else
    {
        *pr80Result = *pr80Val1Org;
        fFsw |= X86_FSW_ES | X86_FSW_B;
    }
    return fFsw | X86_FSW_IE;
}
5786
5787
/**
 * Common implementation of FPREM and FPREM1.
 *
 * @param   fLegacyInstr    true for FPREM (truncating), false for FPREM1
 *                          (round-to-nearest); forwarded to the worker.
 */
static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                             PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
{
    uint16_t const fFcw = pFpuState->FCW;
    /* Note: C2 is deliberately not carried over (see the commented-out bit);
       the partial remainder computation determines it. */
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
       In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
       of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
       for the FPREM & FPREM1 instructions in the intel reference manual claims!) */
    if (   RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
        || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
    {
        if (fFcw & X86_FCW_IM)
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
    else if (   (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
             || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
    {
        if (fFcw & X86_FCW_DM)
        {
            /* Masked #DE: normalize pseudo-denormals and compute the remainder anyway. */
            PCRTFLOAT80U const pr80Val1Org = pr80Val1;
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
            fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
                                                           pr80Val1Org, fLegacyInstr);
        }
        else
        {
            /* Unmasked #DE: keep the first operand and signal. */
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_DE;
    }
    /* SoftFloat can handle the rest: */
    else
        fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
                                                       pr80Val1, fLegacyInstr);

    pFpuRes->FSW = fFsw;
}
5836
5837
/** FPREM - partial remainder with truncating (legacy) rounding. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
}
5843
5844
/** FPREM1 - partial remainder with round-to-nearest. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                    PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
}
5850
5851
5852/*********************************************************************************************************************************
5853* x87 FPU Multiplication Operations *
5854*********************************************************************************************************************************/
5855
5856/** Worker for iemAImpl_fmul_r80_by_r80. */
5857static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5858 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5859{
5860 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5861 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5862 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5863}
5864
5865
/**
 * FMUL, 80-bit by 80-bit: pFpuRes->r80Result = *pr80Val1 * *pr80Val2.
 *
 * Screens the operands for 387-invalid encodings and (pseudo-)denormals
 * before handing the multiplication to SoftFloat.  C0/C2/C3 are carried over
 * from the input FSW and TOP is forced to 6.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
    if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
    {
        if (fFcw & X86_FCW_IM)
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
    else if (   (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
             || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
    {
        if (fFcw & X86_FCW_DM)
        {
            /* Masked #DE: normalize pseudo-denormals and multiply anyway. */
            PCRTFLOAT80U const pr80Val1Org = pr80Val1;
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
            fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
        }
        else
        {
            /* Unmasked #DE: keep the first operand and signal. */
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_DE;
    }
    /* SoftFloat can handle the rest: */
    else
        fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);

    pFpuRes->FSW = fFsw;
}
5908
5909
/* FMUL/FIMUL variants taking a 64-bit float, 32-bit float, or 32/16-bit integer second operand. */
EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5914
5915
5916/*********************************************************************************************************************************
5917* x87 FPU Addition *
5918*********************************************************************************************************************************/
5919
5920/** Worker for iemAImpl_fadd_r80_by_r80. */
5921static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5922 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5923{
5924 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5925 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5926 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5927}
5928
5929
/**
 * FADD, 80-bit by 80-bit: pFpuRes->r80Result = *pr80Val1 + *pr80Val2.
 *
 * Screens the operands for 387-invalid encodings and (pseudo-)denormals
 * before handing the addition to SoftFloat.  C0/C2/C3 are carried over from
 * the input FSW and TOP is forced to 6.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
    if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
    {
        if (fFcw & X86_FCW_IM)
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
    else if (   (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
             || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
    {
        if (fFcw & X86_FCW_DM)
        {
            /* Masked #DE: normalize pseudo-denormals and add anyway. */
            PCRTFLOAT80U const pr80Val1Org = pr80Val1;
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
            fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
        }
        else
        {
            /* Unmasked #DE: keep the first operand and signal. */
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_DE;
    }
    /* SoftFloat can handle the rest: */
    else
        fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);

    pFpuRes->FSW = fFsw;
}
5972
5973
/* FADD/FIADD variants taking a 64-bit float, 32-bit float, or 32/16-bit integer second operand. */
EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
5978
5979
5980/*********************************************************************************************************************************
5981* x87 FPU Subtraction *
5982*********************************************************************************************************************************/
5983
5984/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
5985static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5986 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5987{
5988 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5989 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5990 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5991}
5992
5993
/**
 * FSUB, 80-bit by 80-bit: pFpuRes->r80Result = *pr80Val1 - *pr80Val2.
 *
 * Screens the operands for 387-invalid encodings and (pseudo-)denormals
 * before handing the subtraction to SoftFloat.  C0/C2/C3 are carried over
 * from the input FSW and TOP is forced to 6.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
    if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
    {
        if (fFcw & X86_FCW_IM)
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
    else if (   (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
             || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
    {
        if (fFcw & X86_FCW_DM)
        {
            /* Masked #DE: normalize pseudo-denormals and subtract anyway. */
            PCRTFLOAT80U const pr80Val1Org = pr80Val1;
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
            fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
        }
        else
        {
            /* Unmasked #DE: keep the first operand and signal. */
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_DE;
    }
    /* SoftFloat can handle the rest: */
    else
        fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);

    pFpuRes->FSW = fFsw;
}
6036
6037
/* FSUB/FISUB variants taking a 64-bit float, 32-bit float, or 32/16-bit integer second operand. */
EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6042
6043
/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
/**
 * FSUBR, 80-bit by 80-bit: pFpuRes->r80Result = *pr80Val2 - *pr80Val1.
 *
 * The screening is identical to iemAImpl_fsub_r80_by_r80; only the argument
 * order of the worker calls differs.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
    if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
    {
        if (fFcw & X86_FCW_IM)
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
    else if (   (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
             || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
    {
        if (fFcw & X86_FCW_DM)
        {
            /* Masked #DE: normalize pseudo-denormals and do the reversed subtraction anyway. */
            PCRTFLOAT80U const pr80Val1Org = pr80Val1;
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
            fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
        }
        else
        {
            /* Unmasked #DE: keep the first operand and signal. */
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_DE;
    }
    /* SoftFloat can handle the rest: */
    else
        fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);

    pFpuRes->FSW = fFsw;
}
6087
6088
/* FSUBR/FISUBR variants taking a 64-bit float, 32-bit float, or 32/16-bit integer second operand. */
EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6093
6094
6095/*********************************************************************************************************************************
* x87 FPU Trigonometric Operations                                                                                               *
6097*********************************************************************************************************************************/
6098
6099
/** FPATAN: no portable C implementation yet; the assembly variant is used
 *  when available, so hitting this stub is a hard, deliberate failure. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                    PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
    AssertReleaseFailed();
}
6106
6107#endif /* IEM_WITHOUT_ASSEMBLY */
6108
/** FPATAN, Intel flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                          PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}

/** FPATAN, AMD flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6120
6121
6122#if defined(IEM_WITHOUT_ASSEMBLY)
/** FPTAN: no portable C implementation yet; reaching this stub is fatal by design. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
    AssertReleaseFailed();
}
6128#endif /* IEM_WITHOUT_ASSEMBLY */
6129
/** FPTAN, AMD flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}

/** FPTAN, Intel flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6139
6140
6141#ifdef IEM_WITHOUT_ASSEMBLY
/** FSIN: no portable C implementation yet; reaching this stub is fatal by design. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    RT_NOREF(pFpuState, pFpuRes, pr80Val);
    AssertReleaseFailed();
}
6147#endif /* IEM_WITHOUT_ASSEMBLY */
6148
/** FSIN, AMD flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}

/** FSIN, Intel flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6158
6159#ifdef IEM_WITHOUT_ASSEMBLY
/** FSINCOS: no portable C implementation yet; reaching this stub is fatal by design. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
    AssertReleaseFailed();
}
6165#endif /* IEM_WITHOUT_ASSEMBLY */
6166
/** FSINCOS, AMD flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}

/** FSINCOS, Intel flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6176
6177
6178#ifdef IEM_WITHOUT_ASSEMBLY
/** FCOS: no portable C implementation yet; reaching this stub is fatal by design. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    RT_NOREF(pFpuState, pFpuRes, pr80Val);
    AssertReleaseFailed();
}
6184#endif /* IEM_WITHOUT_ASSEMBLY */
6185
/** FCOS, AMD flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}

/** FCOS, Intel flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6195
6196#ifdef IEM_WITHOUT_ASSEMBLY
6197
6198
6199/*********************************************************************************************************************************
6200* x87 FPU Compare and Testing Operations *
6201*********************************************************************************************************************************/
6202
6203IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6204{
6205 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6206
6207 if (RTFLOAT80U_IS_ZERO(pr80Val))
6208 fFsw |= X86_FSW_C3;
6209 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6210 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6211 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6212 {
6213 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6214 if (!(pFpuState->FCW & X86_FCW_DM))
6215 fFsw |= X86_FSW_ES | X86_FSW_B;
6216 }
6217 else
6218 {
6219 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6220 if (!(pFpuState->FCW & X86_FCW_IM))
6221 fFsw |= X86_FSW_ES | X86_FSW_B;
6222 }
6223
6224 *pu16Fsw = fFsw;
6225}
6226
6227
6228IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6229{
6230 RT_NOREF(pFpuState);
6231 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6232
6233 /* C1 = sign bit (always, even if empty Intel says). */
6234 if (pr80Val->s.fSign)
6235 fFsw |= X86_FSW_C1;
6236
6237 /* Classify the value in C0, C2, C3. */
6238 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6239 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6240 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6241 fFsw |= X86_FSW_C2;
6242 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6243 fFsw |= X86_FSW_C3;
6244 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6245 fFsw |= X86_FSW_C0;
6246 else if (RTFLOAT80U_IS_INF(pr80Val))
6247 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6248 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6249 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6250 /* whatever else: 0 */
6251
6252 *pu16Fsw = fFsw;
6253}
6254
6255
/**
 * Worker for fcom, fucom, and friends.
 *
 * Compares @a pr80Val1 against @a pr80Val2 and reports the outcome in the
 * returned FSW condition codes using the x87 convention:
 *   - C3           : equal,
 *   - C0           : val1 < val2,
 *   - none set     : val1 > val2,
 *   - C0 + C2 + C3 : unordered / invalid operands.
 *
 * @returns The updated FSW.
 * @param   pr80Val1        The first operand.
 * @param   pr80Val2        The second operand.
 * @param   fFcw            The FPU control word (exception masks).
 * @param   fFsw            Incoming FSW bits to merge (caller supplies TOP).
 * @param   fIeOnAllNaNs    Whether quiet NaNs also raise \#IE (fcom) or only
 *                          signalling NaNs do (fucom).
 */
static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
                                                uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
{
    /*
     * Unpack the values.
     */
    bool const fSign1       = pr80Val1->s.fSign;
    int32_t    iExponent1   = pr80Val1->s.uExponent;
    uint64_t   uMantissa1   = pr80Val1->s.uMantissa;

    bool const fSign2       = pr80Val2->s.fSign;
    int32_t    iExponent2   = pr80Val2->s.uExponent;
    uint64_t   uMantissa2   = pr80Val2->s.uMantissa;

    /*
     * Check for invalid inputs.
     */
    if (   RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
        || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
    {
        if (!(fFcw & X86_FCW_IM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
        return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
    }

    /*
     * Check for NaNs and indefinites, they are all unordered and trumps #DE.
     */
    if (   RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
        || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
    {
        /* Signalling NaNs always raise #IE; quiet ones only for the ordered (fcom) variant. */
        if (   fIeOnAllNaNs
            || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
            || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
        {
            fFsw |= X86_FSW_IE;
            if (!(fFcw & X86_FCW_IM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
    }

    /*
     * Normalize the values.
     */
    if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
    {
        if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
            iExponent1 = 1; /* pseudo-denormal: integer bit already set, just fix the exponent */
        else
        {
            /* Shift the mantissa so bit 63 is set and adjust the (unbiased 1-based) exponent to match. */
            iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
            uMantissa1 <<= iExponent1;
            iExponent1 = 1 - iExponent1;
        }
        fFsw |= X86_FSW_DE;
        if (!(fFcw & X86_FCW_DM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
    {
        if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
            iExponent2 = 1;
        else
        {
            iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
            uMantissa2 <<= iExponent2;
            iExponent2 = 1 - iExponent2;
        }
        fFsw |= X86_FSW_DE;
        if (!(fFcw & X86_FCW_DM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    /*
     * Test if equal (val1 == val2):
     */
    if (   uMantissa1 == uMantissa2
        && iExponent1 == iExponent2
        && (   fSign1 == fSign2
            || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
        fFsw |= X86_FSW_C3;
    /*
     * Test if less than (val1 < val2):
     */
    else if (fSign1 && !fSign2)
        fFsw |= X86_FSW_C0; /* negative vs positive */
    else if (fSign1 == fSign2)
    {
        /* Zeros are problematic, however at the most one can be zero here. */
        if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
            return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
        if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
            return fSign1 ? fFsw | X86_FSW_C0 : fFsw;

        /* Same sign: magnitude compare, with the sense inverted for negatives. */
        if (   fSign1
            ^ (   iExponent1 < iExponent2
               || (   iExponent1 == iExponent2
                   && uMantissa1 < uMantissa2 ) ) )
            fFsw |= X86_FSW_C0;
    }
    /* else: No flags set if greater. */

    return fFsw;
}
6365
6366
/** FCOM/FCOMP st,st: ordered compare - quiet NaNs also raise \#IE. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
}




/** FUCOM/FUCOMP st,st: unordered compare - only signalling NaNs raise \#IE. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
}
6381
6382
/** FCOM m64real: widen the memory operand to 80 bits and compare.
 *  Note TOP=7 here (memory form), unlike the register form's 6. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
{
    RTFLOAT80U r80Val2;
    uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
    Assert(!fFsw || fFsw == X86_FSW_DE); /* the conversion can only flag #DE */
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
    /* Merge the conversion's #DE unless the compare already raised #IE (IE trumps DE). */
    if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
    {
        if (!(pFpuState->FCW & X86_FCW_DM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
        *pfFsw |= fFsw;
    }
}


/** FCOM m32real: same as the r64 variant, just with a single-precision operand. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
{
    RTFLOAT80U r80Val2;
    uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
    Assert(!fFsw || fFsw == X86_FSW_DE); /* the conversion can only flag #DE */
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
    /* Merge the conversion's #DE unless the compare already raised #IE (IE trumps DE). */
    if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
    {
        if (!(pFpuState->FCW & X86_FCW_DM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
        *pfFsw |= fFsw;
    }
}
6413
6414
/** FICOM m32int: convert the integer (always exact) and reuse the r80 compare. */
IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                   PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
{
    RTFLOAT80U r80Val2;
    iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
    /* The r80-by-r80 helper set TOP=6; rewrite to 7 for the memory-operand form. */
    *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
}


/** FICOM m16int: convert the integer (always exact) and reuse the r80 compare. */
IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                   PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
{
    RTFLOAT80U r80Val2;
    iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
    /* The r80-by-r80 helper set TOP=6; rewrite to 7 for the memory-operand form. */
    *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
}
6431
6432
/**
 * Worker for fcomi & fucomi.
 *
 * Performs the compare and maps the resulting C3/C2/C0 condition codes onto
 * EFLAGS.ZF/PF/CF.
 *
 * @returns The resulting EFLAGS value (X86_EFL_IF and the reserved-one bit
 *          are OR'ed in - presumably what the flag-update path expects;
 *          confirm against the callers).
 * @param   pr80Val1        The first operand.
 * @param   pr80Val2        The second operand.
 * @param   fFcw            The FPU control word.
 * @param   fFswIn          The incoming FSW; its C0-C3 bits are preserved
 *                          into *pfFsw.
 * @param   fIeOnAllNaNs    fcomi (true) vs fucomi (false) NaN behaviour.
 * @param   pfFsw           Where to return the updated status word.
 */
static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
                                                 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
{
    uint16_t fFsw    = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
    /* C3 -> ZF, C2 -> PF, C0 -> CF; plain right shifts work because the FSW bits sit above the EFLAGS ones. */
    uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
                     | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
                     | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));

    /* Note! C1 is not cleared as per docs! Everything is preserved. */
    *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
    return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
}
6448
6449
/** FCOMI/FCOMIP: ordered compare into EFLAGS - quiet NaNs also raise \#IE. */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                       PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
}


/** FUCOMI/FUCOMIP: unordered compare into EFLAGS - only signalling NaNs raise \#IE. */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
}
6462
6463
6464/*********************************************************************************************************************************
6465* x87 FPU Other Operations *
6466*********************************************************************************************************************************/
6467
/**
 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
 *
 * @returns The updated FSW (rounding/precision flags merged in by the
 *          SoftFloat-state-to-FSW conversion).
 * @param   pr80Val     The value to round to an integer.
 * @param   pr80Result  Where to return the rounded value.
 * @param   fFcw        The FPU control word (supplies the rounding mode).
 * @param   fFsw        Incoming FSW bits to merge.
 */
static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
{
    /* Let SoftFloat do the rounding; 'exact' is set so inexact results raise #PE. */
    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
    iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
                                                      true /*exact / generate #PE */, &SoftState));
    return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
}
6478
6479
/** FRNDINT: round ST(0) to an integer according to FCW.RC. */
IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);

    if (RTFLOAT80U_IS_NORMAL(pr80Val))
        fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
    /* Zeros, quiet NaNs, indefinites and infinities round to themselves. */
    else if (   RTFLOAT80U_IS_ZERO(pr80Val)
             || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
             || RTFLOAT80U_IS_INDEFINITE(pr80Val)
             || RTFLOAT80U_IS_INF(pr80Val))
        pFpuRes->r80Result = *pr80Val;
    else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
    {
        /* Denormal input: #DE; only round if the exception is masked. */
        fFsw |= X86_FSW_DE;
        if (fFcw & X86_FCW_DM)
            fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
        else
        {
            pFpuRes->r80Result = *pr80Val;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    }
    else
    {
        /* Signalling NaNs, pseudo-NaNs, unnormals etc: #IE. */
        if (fFcw & X86_FCW_IM)
        {
            if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
                pFpuRes->r80Result = g_r80Indefinite;
            else
            {
                pFpuRes->r80Result = *pr80Val;
                pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
            }
        }
        else
        {
            pFpuRes->r80Result = *pr80Val;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    pFpuRes->FSW = fFsw;
}
6524
6525
/** FSCALE: scale ST(0) by 2^trunc(ST(1)). */
IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                    PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
       it does everything we need it to do. */
    uint16_t const fFcw = pFpuState->FCW;
    /* Keep the incoming C0/C2/C3; TOP=6 as for the other two-operand helpers in this file. */
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
    extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
    pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
}
6537
6538
/**
 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
 *
 * @returns The updated FSW.
 * @param   pr80Val     The (non-negative) value to take the square root of.
 * @param   pr80Result  Where to return the result.
 * @param   fFcw        The FPU control word.
 * @param   fFsw        Incoming FSW bits to merge.
 */
static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
{
    Assert(!pr80Val->s.fSign); /* caller filters out negative inputs (those raise #IE) */
    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
    iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
    return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
}
6549
6550
/** FSQRT: square root of ST(0); negative inputs other than -0 raise \#IE
 *  (sqrt(+/-0) = +/-0 and sqrt(+Inf) = +Inf pass straight through). */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);

    if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
        fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
    else if (   RTFLOAT80U_IS_ZERO(pr80Val)
             || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
             || RTFLOAT80U_IS_INDEFINITE(pr80Val)
             || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
        pFpuRes->r80Result = *pr80Val;
    else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
    {
        fFsw |= X86_FSW_DE;
        if (fFcw & X86_FCW_DM)
            fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
        else
        {
            pFpuRes->r80Result = *pr80Val;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    }
    else
    {
        /* Negative values, signalling NaNs, pseudo-NaNs, unnormals etc: #IE. */
        if (fFcw & X86_FCW_IM)
        {
            if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
                pFpuRes->r80Result = g_r80Indefinite;
            else
            {
                pFpuRes->r80Result = *pr80Val;
                pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
            }
        }
        else
        {
            pFpuRes->r80Result = *pr80Val;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    pFpuRes->FSW = fFsw;
}
6595
6596
6597/**
6598 * @code{.unparsed}
6599 * x x * ln2
6600 * f(x) = 2 - 1 = e - 1
6601 *
6602 * @endcode
6603 *
6604 * We can approximate e^x by a Taylor/Maclaurin series (see
6605 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
6606 * @code{.unparsed}
6607 * n 0 1 2 3 4
6608 * inf x x x x x x
6609 * SUM ----- = --- + --- + --- + --- + --- + ...
6610 * n=0 n! 0! 1! 2! 3! 4!
6611 *
6612 * 2 3 4
6613 * x x x
6614 * = 1 + x + --- + --- + --- + ...
6615 * 2! 3! 4!
6616 * @endcode
6617 *
6618 * Given z = x * ln2, we get:
6619 * @code{.unparsed}
6620 * 2 3 4 n
6621 * z z z z z
6622 * e - 1 = z + --- + --- + --- + ... + ---
6623 * 2! 3! 4! n!
6624 * @endcode
6625 *
6626 * Wanting to use Horner's method, we move one z outside and get:
6627 * @code{.unparsed}
6628 * 2 3 (n-1)
6629 * z z z z
6630 * = z ( 1 + --- + --- + --- + ... + ------- )
6631 * 2! 3! 4! n!
6632 * @endcode
6633 *
6634 * The constants we need for using Horner's methods are 1 and 1 / n!.
6635 *
6636 * For very tiny x values, we can get away with f(x) = x * ln 2, because
6637 * because we don't have the necessary precision to represent 1.0 + z/3 + ...
6638 * and can approximate it to be 1.0. For a visual demonstration of this
6639 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
 * as it is valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
6641 *
6642 *
6643 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
6644 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
6645 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
6646 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
6647 * blocks). (The one bit difference is probably an implicit one missing from
6648 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
6649 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
6650 * exponent.
6651 *
6652 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
6653 * successfully reproduced the exact results from an Intel 10980XE, there is
 * always a portion of rounding differences. Not going to spend too much time
6655 * on getting this 100% the same, at least not now.
6656 *
 * P.S. If someone is really curious about the 8087 and its constants:
6658 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
6659 *
6660 *
6661 * @param pr80Val The exponent value (x), less than 1.0, greater than
6662 * -1.0 and not zero. This can be a normal, denormal
6663 * or pseudo-denormal value.
6664 * @param pr80Result Where to return the result.
6665 * @param fFcw FPU control word.
6666 * @param fFsw FPU status word.
6667 */
static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
{
    /* As mentioned above, we can skip the expensive polynomial calculation
       as it will be close enough to 1.0 that it makes no difference.

       The cutoff point for intel 10980XE is exponents >= -69. Intel
       also seems to be using a 67-bit or 68-bit constant value, and we get
       a smattering of rounding differences if we go for higher precision. */
    if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
    {
        /* Tiny input: approximate f(x) = x * ln2 with a wide integer multiply. */
        RTUINT256U u256;
        RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
        u256.QWords.qw0 |= 1; /* force #PE */
        /* Denormal/pseudo-denormal inputs use the fixed minimum exponent (1 - bias). */
        fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
                                                   !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
                                                   ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
                                                   : 1 - RTFLOAT80U_EXP_BIAS,
                                                   fFcw, fFsw);
    }
    else
    {
#ifdef IEM_WITH_FLOAT128_FOR_FPU
        /* This approach is not good enough for small values, we end up with zero. */
        int const fOldRounding = iemFpuF128SetRounding(fFcw);
        _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
        _Float128 rd128Result = powf128(2.0L, rd128Val);
        rd128Result -= 1.0L;
        fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
        iemFpuF128RestoreRounding(fOldRounding);

# else
        softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
        float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);

        /* As mentioned above, enforce 68-bit internal mantissa width to better
           match the Intel 10980XE results. */
        unsigned const cPrecision = 68;

        /* first calculate z = x * ln2 */
        float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
                                               cPrecision);

        /* Then do the polynomial evaluation (Horner's method, e^z - 1 = z * (1 + z/2! + ...)). */
        float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
                                                cPrecision, &SoftState);
        r = f128_mul(z, r, &SoftState);

        /* Output the result. */
        fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
# endif
    }
    return fFsw;
}
6721
6722
/** F2XM1: compute 2^ST(0) - 1; architecturally defined for -1.0 <= ST(0) <= +1.0. */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);

    if (RTFLOAT80U_IS_NORMAL(pr80Val))
    {
        /* |x| < 1.0 (exponent below bias): the normal computation path. */
        if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
            fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
        else
        {
            /* Special case:
               2^+1.0 - 1.0 = 1.0
               2^-1.0 - 1.0 = -0.5 */
            if (   pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
                && pr80Val->s.uMantissa == RT_BIT_64(63))
            {
                pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
                pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
                pFpuRes->r80Result.s.fSign     = pr80Val->s.fSign;
            }
            /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
            /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
            else
                pFpuRes->r80Result = *pr80Val;
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    }
    else if (   RTFLOAT80U_IS_ZERO(pr80Val)
             || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
             || RTFLOAT80U_IS_INDEFINITE(pr80Val))
        pFpuRes->r80Result = *pr80Val;
    else if (RTFLOAT80U_IS_INF(pr80Val))
        /* 2^-Inf - 1 = -1.0; 2^+Inf - 1 = +Inf. */
        pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
    else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
    {
        fFsw |= X86_FSW_DE;
        if (fFcw & X86_FCW_DM)
            fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
        else
        {
            pFpuRes->r80Result = *pr80Val;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    }
    else
    {
        /* Unnormals, pseudo-NaNs and signalling NaNs: #IE. */
        if (   (   RTFLOAT80U_IS_UNNORMAL(pr80Val)
                || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
            && (fFcw & X86_FCW_IM))
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val;
            if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
                pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
        }
        fFsw |= X86_FSW_IE;
        if (!(fFcw & X86_FCW_IM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }
    pFpuRes->FSW = fFsw;
}
6788
6789#endif /* IEM_WITHOUT_ASSEMBLY */
6790
/** F2XM1, AMD flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
}

/** F2XM1, Intel flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
}
6800
6801#ifdef IEM_WITHOUT_ASSEMBLY
6802
6803IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6804{
6805 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6806 pFpuRes->r80Result = *pr80Val;
6807 pFpuRes->r80Result.s.fSign = 0;
6808}
6809
6810
6811IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6812{
6813 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6814 pFpuRes->r80Result = *pr80Val;
6815 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
6816}
6817
6818
/** FXTRACT: split ST(0) into its unbiased exponent (result1) and its
 *  significand rebased to [1.0, 2.0) (result2). */
IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    uint16_t const fFcw = pFpuState->FCW;
    /* TOP = 6: the exponent is pushed as a second result. */
    uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    if (RTFLOAT80U_IS_NORMAL(pr80Val))
    {
        softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
        iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));

        /* Same mantissa, exponent rebased to 0 (biased value RTFLOAT80U_EXP_BIAS). */
        pFpuResTwo->r80Result2.s.fSign     = pr80Val->s.fSign;
        pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
        pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
    }
    else if (RTFLOAT80U_IS_ZERO(pr80Val))
    {
        /* Zero input raises #ZE; masked result is (-Inf, +/-0). */
        fFsw |= X86_FSW_ZE;
        if (fFcw & X86_FCW_ZM)
        {
            pFpuResTwo->r80Result1 = g_ar80Infinity[1];
            pFpuResTwo->r80Result2 = *pr80Val;
        }
        else
        {
            /* Unmasked: nothing is pushed, so restore TOP to 7. */
            pFpuResTwo->r80Result2 = *pr80Val;
            fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
    {
        fFsw |= X86_FSW_DE;
        if (fFcw & X86_FCW_DM)
        {
            /* Normalize the mantissa to recover the true (sub-minimal) exponent. */
            pFpuResTwo->r80Result2.s.fSign     = pr80Val->s.fSign;
            pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
            pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
            int32_t iExponent = -16382;
            while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
            {
                pFpuResTwo->r80Result2.s.uMantissa <<= 1;
                iExponent--;
            }

            softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
            iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
        }
        else
        {
            /* Unmasked #DE: nothing is pushed, restore TOP to 7. */
            pFpuResTwo->r80Result2 = *pr80Val;
            fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    else if (   RTFLOAT80U_IS_QUIET_NAN(pr80Val)
             || RTFLOAT80U_IS_INDEFINITE(pr80Val))
    {
        /* QNaN / indefinite is propagated to both results. */
        pFpuResTwo->r80Result1 = *pr80Val;
        pFpuResTwo->r80Result2 = *pr80Val;
    }
    else if (RTFLOAT80U_IS_INF(pr80Val))
    {
        /* +/-Inf: exponent result is +Inf, significand result keeps the input. */
        pFpuResTwo->r80Result1 = g_ar80Infinity[0];
        pFpuResTwo->r80Result2 = *pr80Val;
    }
    else
    {
        /* Signalling NaNs, pseudo-NaNs, unnormals etc: #IE. */
        if (fFcw & X86_FCW_IM)
        {
            if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
                pFpuResTwo->r80Result1 = g_r80Indefinite;
            else
            {
                pFpuResTwo->r80Result1 = *pr80Val;
                pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
            }
            pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
        }
        else
        {
            /* Unmasked #IE: nothing is pushed, restore TOP to 7. */
            pFpuResTwo->r80Result2 = *pr80Val;
            fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
        }
        fFsw |= X86_FSW_IE;
    }
    pFpuResTwo->FSW = fFsw;
}
6904
6905
/** FYL2X: no portable C implementation yet; reaching this stub is fatal by design. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
    AssertReleaseFailed();
}
6912
6913#endif /* IEM_WITHOUT_ASSEMBLY */
6914
/** FYL2X, Intel flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                         PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}

/** FYL2X, AMD flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                       PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6926
6927#if defined(IEM_WITHOUT_ASSEMBLY)
6928
/** FYL2XP1: no portable C implementation yet; reaching this stub is fatal by design. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                     PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
    AssertReleaseFailed();
}
6935
6936#endif /* IEM_WITHOUT_ASSEMBLY */
6937
/** FYL2XP1, Intel flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                           PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}

/** FYL2XP1, AMD flavour - no vendor-specific deviation is modelled; forwards to the common implementation. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                         PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6949
6950
6951/*********************************************************************************************************************************
6952* MMX, SSE & AVX *
6953*********************************************************************************************************************************/
6954
6955/*
6956 * MOVSLDUP / VMOVSLDUP
6957 */
6958IEM_DECL_IMPL_DEF(void, iemAImpl_movsldup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
6959{
6960 puDst->au32[0] = puSrc->au32[0];
6961 puDst->au32[1] = puSrc->au32[0];
6962 puDst->au32[2] = puSrc->au32[2];
6963 puDst->au32[3] = puSrc->au32[2];
6964}
6965
6966#ifdef IEM_WITH_VEX
6967
6968IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
6969{
6970 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
6971 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
6972 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
6973 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
6974 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6975 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6976 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6977 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6978}
6979
6980
6981IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
6982{
6983 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
6984 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
6985 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
6986 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
6987 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
6988 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
6989 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
6990 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
6991}
6992
6993#endif /* IEM_WITH_VEX */
6994
6995
6996/*
6997 * MOVSHDUP / VMOVSHDUP
6998 */
6999IEM_DECL_IMPL_DEF(void, iemAImpl_movshdup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7000{
7001 puDst->au32[0] = puSrc->au32[1];
7002 puDst->au32[1] = puSrc->au32[1];
7003 puDst->au32[2] = puSrc->au32[3];
7004 puDst->au32[3] = puSrc->au32[3];
7005}
7006
7007#ifdef IEM_WITH_VEX
7008
7009IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7010{
7011 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7012 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7013 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7014 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7015 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7016 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7017 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7018 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7019}
7020
7021
7022IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7023{
7024 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7025 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7026 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7027 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7028 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7029 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7030 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7031 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7032}
7033
7034#endif /* IEM_WITH_VEX */
7035
7036
7037/*
7038 * MOVDDUP / VMOVDDUP
7039 */
7040IEM_DECL_IMPL_DEF(void, iemAImpl_movddup,(PRTUINT128U puDst, uint64_t uSrc))
7041{
7042 puDst->au64[0] = uSrc;
7043 puDst->au64[1] = uSrc;
7044}
7045
7046#ifdef IEM_WITH_VEX
7047
7048IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7049{
7050 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7051 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7052 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7053 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7054}
7055
7056IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7057{
7058 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7059 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7060 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7061 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7062}
7063
7064#endif /* IEM_WITH_VEX */
7065
7066
7067/*
7068 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7069 */
7070#ifdef IEM_WITHOUT_ASSEMBLY
7071
7072IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7073{
7074 RT_NOREF(pFpuState);
7075 *puDst &= *puSrc;
7076}
7077
7078
7079IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7080{
7081 RT_NOREF(pFpuState);
7082 puDst->au64[0] &= puSrc->au64[0];
7083 puDst->au64[1] &= puSrc->au64[1];
7084}
7085
7086#endif
7087
7088IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7089 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7090{
7091 RT_NOREF(pExtState);
7092 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7093 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7094}
7095
7096
7097IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7098 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7099{
7100 RT_NOREF(pExtState);
7101 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7102 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7103 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7104 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7105}
7106
7107
7108/*
7109 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7110 */
7111#ifdef IEM_WITHOUT_ASSEMBLY
7112
7113IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7114{
7115 RT_NOREF(pFpuState);
7116 *puDst = ~*puDst & *puSrc;
7117}
7118
7119
7120IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7121{
7122 RT_NOREF(pFpuState);
7123 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7124 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7125}
7126
7127#endif
7128
7129IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7130 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7131{
7132 RT_NOREF(pExtState);
7133 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7134 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7135}
7136
7137
7138IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7139 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7140{
7141 RT_NOREF(pExtState);
7142 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7143 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7144 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7145 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7146}
7147
7148
7149/*
7150 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7151 */
7152#ifdef IEM_WITHOUT_ASSEMBLY
7153
7154IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7155{
7156 RT_NOREF(pFpuState);
7157 *puDst |= *puSrc;
7158}
7159
7160
7161IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7162{
7163 RT_NOREF(pFpuState);
7164 puDst->au64[0] |= puSrc->au64[0];
7165 puDst->au64[1] |= puSrc->au64[1];
7166}
7167
7168#endif
7169
7170IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7171 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7172{
7173 RT_NOREF(pExtState);
7174 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7175 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7176}
7177
7178
7179IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7180 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7181{
7182 RT_NOREF(pExtState);
7183 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7184 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7185 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7186 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7187}
7188
7189
7190/*
7191 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7192 */
7193#ifdef IEM_WITHOUT_ASSEMBLY
7194
7195IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7196{
7197 RT_NOREF(pFpuState);
7198 *puDst ^= *puSrc;
7199}
7200
7201
7202IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7203{
7204 RT_NOREF(pFpuState);
7205 puDst->au64[0] ^= puSrc->au64[0];
7206 puDst->au64[1] ^= puSrc->au64[1];
7207}
7208
7209#endif
7210
7211IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7212 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7213{
7214 RT_NOREF(pExtState);
7215 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7216 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7217}
7218
7219
7220IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7221 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7222{
7223 RT_NOREF(pExtState);
7224 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7225 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7226 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7227 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7228}
7229
7230
7231/*
7232 * PCMPEQB / VPCMPEQB
7233 */
7234#ifdef IEM_WITHOUT_ASSEMBLY
7235
7236IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7237{
7238 RT_NOREF(pFpuState);
7239 RTUINT64U uSrc1 = { *puDst };
7240 RTUINT64U uSrc2 = { *puSrc };
7241 RTUINT64U uDst;
7242 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7243 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7244 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7245 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7246 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7247 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7248 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7249 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7250 *puDst = uDst.u;
7251}
7252
7253
7254IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7255{
7256 RT_NOREF(pFpuState);
7257 RTUINT128U uSrc1 = *puDst;
7258 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7259 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7260 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7261 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7262 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7263 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7264 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7265 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7266 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7267 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7268 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7269 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7270 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7271 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7272 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7273 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7274}
7275
7276#endif
7277
7278IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7279 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7280{
7281 RT_NOREF(pExtState);
7282 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7283 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7284 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7285 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7286 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7287 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7288 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7289 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7290 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7291 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7292 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7293 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7294 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7295 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7296 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7297 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7298}
7299
7300IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7301 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7302{
7303 RT_NOREF(pExtState);
7304 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7305 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7306 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7307 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7308 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7309 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7310 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7311 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7312 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7313 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7314 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7315 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7316 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7317 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7318 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7319 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7320 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7321 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7322 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7323 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7324 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7325 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7326 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7327 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7328 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7329 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7330 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7331 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7332 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7333 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7334 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7335 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7336}
7337
7338
7339/*
7340 * PCMPEQW / VPCMPEQW
7341 */
7342#ifdef IEM_WITHOUT_ASSEMBLY
7343
7344IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7345{
7346 RT_NOREF(pFpuState);
7347 RTUINT64U uSrc1 = { *puDst };
7348 RTUINT64U uSrc2 = { *puSrc };
7349 RTUINT64U uDst;
7350 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7351 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7352 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7353 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7354 *puDst = uDst.u;
7355}
7356
7357
7358IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7359{
7360 RT_NOREF(pFpuState);
7361 RTUINT128U uSrc1 = *puDst;
7362 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7363 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7364 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7365 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7366 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7367 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7368 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7369 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7370}
7371
7372#endif
7373
7374IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7375 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7376{
7377 RT_NOREF(pExtState);
7378 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7379 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7380 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7381 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7382 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7383 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7384 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7385 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7386}
7387
7388IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7389 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7390{
7391 RT_NOREF(pExtState);
7392 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7393 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7394 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7395 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7396 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7397 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7398 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7399 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7400 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
7401 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
7402 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
7403 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
7404 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
7405 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
7406 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
7407 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
7408}
7409
7410
7411/*
7412 * PCMPEQD / VPCMPEQD.
7413 */
7414#ifdef IEM_WITHOUT_ASSEMBLY
7415
7416IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7417{
7418 RT_NOREF(pFpuState);
7419 RTUINT64U uSrc1 = { *puDst };
7420 RTUINT64U uSrc2 = { *puSrc };
7421 RTUINT64U uDst;
7422 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
7423 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
7424 *puDst = uDst.u;
7425}
7426
7427
7428IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7429{
7430 RT_NOREF(pFpuState);
7431 RTUINT128U uSrc1 = *puDst;
7432 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
7433 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
7434 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
7435 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
7436}
7437
7438#endif /* IEM_WITHOUT_ASSEMBLY */
7439
7440IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7441 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7442{
7443 RT_NOREF(pExtState);
7444 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7445 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7446 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7447 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7448}
7449
7450IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7451 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7452{
7453 RT_NOREF(pExtState);
7454 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7455 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7456 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7457 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7458 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
7459 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
7460 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
7461 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
7462}
7463
7464
7465/*
7466 * PCMPEQQ / VPCMPEQQ.
7467 */
7468IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7469{
7470 RT_NOREF(pFpuState);
7471 RTUINT128U uSrc1 = *puDst;
7472 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
7473 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
7474}
7475
7476IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7477 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7478{
7479 RT_NOREF(pExtState);
7480 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7481 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7482}
7483
7484IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7485 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7486{
7487 RT_NOREF(pExtState);
7488 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7489 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7490 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
7491 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
7492}
7493
7494
7495/*
7496 * PCMPGTB / VPCMPGTB
7497 */
7498#ifdef IEM_WITHOUT_ASSEMBLY
7499
7500IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7501{
7502 RT_NOREF(pFpuState);
7503 RTUINT64U uSrc1 = { *puDst };
7504 RTUINT64U uSrc2 = { *puSrc };
7505 RTUINT64U uDst;
7506 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
7507 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
7508 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
7509 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
7510 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
7511 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
7512 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
7513 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
7514 *puDst = uDst.u;
7515}
7516
7517
7518IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7519{
7520 RT_NOREF(pFpuState);
7521 RTUINT128U uSrc1 = *puDst;
7522 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
7523 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
7524 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
7525 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
7526 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
7527 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
7528 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
7529 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
7530 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
7531 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
7532 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
7533 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
7534 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
7535 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
7536 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
7537 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
7538}
7539
7540#endif
7541
7542IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7543 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7544{
7545 RT_NOREF(pExtState);
7546 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7547 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7548 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7549 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7550 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7551 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7552 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7553 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7554 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7555 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7556 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7557 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7558 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7559 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7560 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7561 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7562}
7563
7564IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7565 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7566{
7567 RT_NOREF(pExtState);
7568 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7569 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7570 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7571 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7572 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7573 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7574 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7575 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7576 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7577 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7578 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7579 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7580 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7581 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7582 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7583 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7584 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
7585 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
7586 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
7587 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
7588 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
7589 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
7590 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
7591 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
7592 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
7593 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
7594 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
7595 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
7596 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
7597 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
7598 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
7599 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
7600}
7601
7602
7603/*
7604 * PCMPGTW / VPCMPGTW
7605 */
7606#ifdef IEM_WITHOUT_ASSEMBLY
7607
7608IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7609{
7610 RT_NOREF(pFpuState);
7611 RTUINT64U uSrc1 = { *puDst };
7612 RTUINT64U uSrc2 = { *puSrc };
7613 RTUINT64U uDst;
7614 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
7615 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
7616 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
7617 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
7618 *puDst = uDst.u;
7619}
7620
7621
7622IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7623{
7624 RT_NOREF(pFpuState);
7625 RTUINT128U uSrc1 = *puDst;
7626 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
7627 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
7628 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
7629 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
7630 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
7631 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
7632 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
7633 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
7634}
7635
7636#endif
7637
7638IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7639 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7640{
7641 RT_NOREF(pExtState);
7642 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
7643 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
7644 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
7645 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
7646 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
7647 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
7648 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
7649 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
7650}
7651
7652IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7653 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7654{
7655 RT_NOREF(pExtState);
7656 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
7657 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
7658 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
7659 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
7660 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
7661 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
7662 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
7663 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
7664 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
7665 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
7666 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
7667 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
7668 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
7669 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
7670 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
7671 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
7672}
7673
7674
7675/*
7676 * PCMPGTD / VPCMPGTD.
7677 */
7678#ifdef IEM_WITHOUT_ASSEMBLY
7679
7680IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7681{
7682 RT_NOREF(pFpuState);
7683 RTUINT64U uSrc1 = { *puDst };
7684 RTUINT64U uSrc2 = { *puSrc };
7685 RTUINT64U uDst;
7686 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
7687 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
7688 *puDst = uDst.u;
7689}
7690
7691
7692IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7693{
7694 RT_NOREF(pFpuState);
7695 RTUINT128U uSrc1 = *puDst;
7696 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
7697 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
7698 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
7699 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
7700}
7701
7702#endif /* IEM_WITHOUT_ASSEMBLY */
7703
7704IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7705 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7706{
7707 RT_NOREF(pExtState);
7708 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
7709 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
7710 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
7711 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
7712}
7713
7714IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7715 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7716{
7717 RT_NOREF(pExtState);
7718 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
7719 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
7720 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
7721 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
7722 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
7723 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
7724 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
7725 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
7726}
7727
7728
7729/*
7730 * PCMPGTQ / VPCMPGTQ.
7731 */
7732IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7733{
7734 RT_NOREF(pFpuState);
7735 RTUINT128U uSrc1 = *puDst;
7736 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
7737 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
7738}
7739
7740IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7741 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7742{
7743 RT_NOREF(pExtState);
7744 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
7745 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
7746}
7747
7748IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7749 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7750{
7751 RT_NOREF(pExtState);
7752 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
7753 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
7754 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
7755 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
7756}
7757
7758
7759/*
7760 * PADDB / VPADDB
7761 */
7762#ifdef IEM_WITHOUT_ASSEMBLY
7763
7764IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7765{
7766 RT_NOREF(pFpuState);
7767 RTUINT64U uSrc1 = { *puDst };
7768 RTUINT64U uSrc2 = { *puSrc };
7769 RTUINT64U uDst;
7770 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
7771 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
7772 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
7773 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
7774 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
7775 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
7776 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
7777 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
7778 *puDst = uDst.u;
7779}
7780
7781
7782IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7783{
7784 RT_NOREF(pFpuState);
7785 RTUINT128U uSrc1 = *puDst;
7786 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
7787 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
7788 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
7789 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
7790 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
7791 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
7792 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
7793 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
7794 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
7795 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
7796 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
7797 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
7798 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
7799 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
7800 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
7801 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
7802}
7803
7804#endif
7805
7806IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7807 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7808{
7809 RT_NOREF(pExtState);
7810 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
7811 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
7812 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
7813 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
7814 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
7815 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
7816 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
7817 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
7818 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
7819 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
7820 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
7821 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
7822 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
7823 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
7824 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
7825 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
7826}
7827
7828IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7829 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7830{
7831 RT_NOREF(pExtState);
7832 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
7833 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
7834 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
7835 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
7836 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
7837 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
7838 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
7839 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
7840 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
7841 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
7842 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
7843 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
7844 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
7845 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
7846 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
7847 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
7848 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
7849 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
7850 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
7851 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
7852 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
7853 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
7854 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
7855 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
7856 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
7857 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
7858 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
7859 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
7860 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
7861 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
7862 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
7863 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
7864}
7865
7866
7867/*
7868 * PADDW / VPADDW
7869 */
7870#ifdef IEM_WITHOUT_ASSEMBLY
7871
7872IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7873{
7874 RT_NOREF(pFpuState);
7875 RTUINT64U uSrc1 = { *puDst };
7876 RTUINT64U uSrc2 = { *puSrc };
7877 RTUINT64U uDst;
7878 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
7879 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
7880 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
7881 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
7882 *puDst = uDst.u;
7883}
7884
7885
7886IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7887{
7888 RT_NOREF(pFpuState);
7889 RTUINT128U uSrc1 = *puDst;
7890 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
7891 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
7892 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
7893 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
7894 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
7895 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
7896 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
7897 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
7898}
7899
7900#endif
7901
7902IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7903 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7904{
7905 RT_NOREF(pExtState);
7906 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
7907 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
7908 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
7909 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
7910 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
7911 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
7912 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
7913 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
7914}
7915
7916IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7917 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7918{
7919 RT_NOREF(pExtState);
7920 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
7921 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
7922 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
7923 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
7924 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
7925 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
7926 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
7927 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
7928 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
7929 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
7930 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
7931 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
7932 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
7933 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
7934 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
7935 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
7936}
7937
7938
7939/*
7940 * PADDD / VPADDD.
7941 */
7942#ifdef IEM_WITHOUT_ASSEMBLY
7943
7944IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7945{
7946 RT_NOREF(pFpuState);
7947 RTUINT64U uSrc1 = { *puDst };
7948 RTUINT64U uSrc2 = { *puSrc };
7949 RTUINT64U uDst;
7950 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
7951 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
7952 *puDst = uDst.u;
7953}
7954
7955
7956IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7957{
7958 RT_NOREF(pFpuState);
7959 RTUINT128U uSrc1 = *puDst;
7960 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
7961 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
7962 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
7963 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
7964}
7965
7966#endif /* IEM_WITHOUT_ASSEMBLY */
7967
7968IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7969 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7970{
7971 RT_NOREF(pExtState);
7972 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
7973 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
7974 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
7975 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
7976}
7977
7978IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7979 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7980{
7981 RT_NOREF(pExtState);
7982 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
7983 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
7984 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
7985 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
7986 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
7987 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
7988 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
7989 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
7990}
7991
7992
7993/*
7994 * PADDQ / VPADDQ.
7995 */
7996#ifdef IEM_WITHOUT_ASSEMBLY
7997
7998IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7999{
8000 RT_NOREF(pFpuState);
8001 *puDst = *puDst + *puSrc;
8002}
8003
8004IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8005{
8006 RT_NOREF(pFpuState);
8007 RTUINT128U uSrc1 = *puDst;
8008 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8009 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8010}
8011
8012#endif
8013
8014IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8015 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8016{
8017 RT_NOREF(pExtState);
8018 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8019 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8020}
8021
8022IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8023 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8024{
8025 RT_NOREF(pExtState);
8026 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8027 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8028 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8029 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8030}
8031
8032
8033/*
8034 * PSUBB / VPSUBB
8035 */
8036#ifdef IEM_WITHOUT_ASSEMBLY
8037
8038IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8039{
8040 RT_NOREF(pFpuState);
8041 RTUINT64U uSrc1 = { *puDst };
8042 RTUINT64U uSrc2 = { *puSrc };
8043 RTUINT64U uDst;
8044 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8045 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8046 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8047 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8048 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8049 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8050 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8051 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8052 *puDst = uDst.u;
8053}
8054
8055
8056IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8057{
8058 RT_NOREF(pFpuState);
8059 RTUINT128U uSrc1 = *puDst;
8060 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8061 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8062 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8063 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8064 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8065 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8066 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8067 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8068 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8069 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8070 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8071 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8072 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8073 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8074 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8075 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8076}
8077
8078#endif
8079
8080IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8081 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8082{
8083 RT_NOREF(pExtState);
8084 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8085 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8086 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8087 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8088 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8089 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8090 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8091 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8092 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8093 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8094 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8095 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8096 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8097 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8098 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8099 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8100}
8101
8102IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8103 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8104{
8105 RT_NOREF(pExtState);
8106 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8107 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8108 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8109 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8110 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8111 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8112 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8113 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8114 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8115 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8116 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8117 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8118 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8119 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8120 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8121 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8122 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
8123 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
8124 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
8125 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
8126 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
8127 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
8128 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
8129 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
8130 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
8131 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
8132 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
8133 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
8134 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
8135 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
8136 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
8137 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
8138}
8139
8140
8141/*
8142 * PSUBW / VPSUBW
8143 */
8144#ifdef IEM_WITHOUT_ASSEMBLY
8145
8146IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8147{
8148 RT_NOREF(pFpuState);
8149 RTUINT64U uSrc1 = { *puDst };
8150 RTUINT64U uSrc2 = { *puSrc };
8151 RTUINT64U uDst;
8152 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
8153 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
8154 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
8155 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
8156 *puDst = uDst.u;
8157}
8158
8159
8160IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8161{
8162 RT_NOREF(pFpuState);
8163 RTUINT128U uSrc1 = *puDst;
8164 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
8165 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
8166 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
8167 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
8168 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
8169 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
8170 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
8171 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
8172}
8173
8174#endif
8175
8176IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8177 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8178{
8179 RT_NOREF(pExtState);
8180 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8181 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8182 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8183 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8184 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8185 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8186 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8187 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8188}
8189
8190IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8191 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8192{
8193 RT_NOREF(pExtState);
8194 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8195 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8196 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8197 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8198 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8199 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8200 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8201 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8202 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
8203 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
8204 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
8205 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
8206 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
8207 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
8208 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
8209 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
8210}
8211
8212
8213/*
8214 * PSUBD / VPSUBD.
8215 */
8216#ifdef IEM_WITHOUT_ASSEMBLY
8217
8218IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8219{
8220 RT_NOREF(pFpuState);
8221 RTUINT64U uSrc1 = { *puDst };
8222 RTUINT64U uSrc2 = { *puSrc };
8223 RTUINT64U uDst;
8224 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
8225 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
8226 *puDst = uDst.u;
8227}
8228
8229
8230IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8231{
8232 RT_NOREF(pFpuState);
8233 RTUINT128U uSrc1 = *puDst;
8234 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
8235 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
8236 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
8237 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
8238}
8239
8240#endif /* IEM_WITHOUT_ASSEMBLY */
8241
8242IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8243 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8244{
8245 RT_NOREF(pExtState);
8246 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
8247 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
8248 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
8249 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
8250}
8251
8252IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8253 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8254{
8255 RT_NOREF(pExtState);
8256 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
8257 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
8258 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
8259 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
8260 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
8261 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
8262 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
8263 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
8264}
8265
8266
8267/*
8268 * PSUBQ / VPSUBQ.
8269 */
8270#ifdef IEM_WITHOUT_ASSEMBLY
8271
8272IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8273{
8274 RT_NOREF(pFpuState);
8275 *puDst = *puDst - *puSrc;
8276}
8277
8278IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8279{
8280 RT_NOREF(pFpuState);
8281 RTUINT128U uSrc1 = *puDst;
8282 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
8283 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
8284}
8285
8286#endif
8287
8288IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8289 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8290{
8291 RT_NOREF(pExtState);
8292 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
8293 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
8294}
8295
8296IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8297 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8298{
8299 RT_NOREF(pExtState);
8300 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
8301 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
8302 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
8303 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
8304}
8305
8306
8307
8308/*
8309 * PMOVMSKB / VPMOVMSKB
8310 */
8311#ifdef IEM_WITHOUT_ASSEMBLY
8312
8313IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
8314{
8315 /* The the most signficant bit from each byte and store them in the given general purpose register. */
8316 uint64_t const uSrc = *pu64Src;
8317 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
8318 | ((uSrc >> (15-1)) & RT_BIT_64(1))
8319 | ((uSrc >> (23-2)) & RT_BIT_64(2))
8320 | ((uSrc >> (31-3)) & RT_BIT_64(3))
8321 | ((uSrc >> (39-4)) & RT_BIT_64(4))
8322 | ((uSrc >> (47-5)) & RT_BIT_64(5))
8323 | ((uSrc >> (55-6)) & RT_BIT_64(6))
8324 | ((uSrc >> (63-7)) & RT_BIT_64(7));
8325}
8326
8327
8328IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
8329{
8330 /* The the most signficant bit from each byte and store them in the given general purpose register. */
8331 uint64_t const uSrc0 = pu128Src->QWords.qw0;
8332 uint64_t const uSrc1 = pu128Src->QWords.qw1;
8333 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
8334 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
8335 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
8336 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
8337 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
8338 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
8339 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
8340 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
8341 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
8342 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
8343 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
8344 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
8345 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
8346 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
8347 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
8348 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
8349}
8350
8351#endif
8352
8353IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
8354{
8355 /* The the most signficant bit from each byte and store them in the given general purpose register. */
8356 uint64_t const uSrc0 = puSrc->QWords.qw0;
8357 uint64_t const uSrc1 = puSrc->QWords.qw1;
8358 uint64_t const uSrc2 = puSrc->QWords.qw2;
8359 uint64_t const uSrc3 = puSrc->QWords.qw3;
8360 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
8361 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
8362 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
8363 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
8364 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
8365 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
8366 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
8367 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
8368 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
8369 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
8370 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
8371 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
8372 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
8373 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
8374 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
8375 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
8376 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
8377 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
8378 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
8379 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
8380 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
8381 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
8382 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
8383 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
8384 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
8385 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
8386 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
8387 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
8388 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
8389 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
8390 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
8391 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
8392}
8393
8394
8395/*
8396 * [V]PSHUFB
8397 */
8398
8399IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8400{
8401 RTUINT64U const uSrc = { *puSrc };
8402 RTUINT64U const uDstIn = { *puDst };
8403 ASMCompilerBarrier();
8404 RTUINT64U uDstOut = { 0 };
8405 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
8406 {
8407 uint8_t idxSrc = uSrc.au8[iByte];
8408 if (!(idxSrc & 0x80))
8409 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
8410 }
8411 *puDst = uDstOut.u;
8412 RT_NOREF(pFpuState);
8413}
8414
8415
8416IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8417{
8418 RTUINT128U const uSrc = { *puSrc };
8419 RTUINT128U const uDstIn = { *puDst };
8420 ASMCompilerBarrier();
8421 puDst->au64[0] = 0;
8422 puDst->au64[1] = 0;
8423 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
8424 {
8425 uint8_t idxSrc = uSrc.au8[iByte];
8426 if (!(idxSrc & 0x80))
8427 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
8428 }
8429 RT_NOREF(pFpuState);
8430}
8431
8432
8433IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8434 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8435{
8436 RTUINT128U const uSrc1 = { *puSrc1 }; /* could be same as puDst */
8437 RTUINT128U const uSrc2 = { *puSrc2 }; /* could be same as puDst */
8438 ASMCompilerBarrier();
8439 puDst->au64[0] = 0;
8440 puDst->au64[1] = 0;
8441 for (unsigned iByte = 0; iByte < 16; iByte++)
8442 {
8443 uint8_t idxSrc = uSrc2.au8[iByte];
8444 if (!(idxSrc & 0x80))
8445 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
8446 }
8447 RT_NOREF(pExtState);
8448}
8449
8450
8451IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8452 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8453{
8454 RTUINT256U const uSrc1 = { *puSrc1 }; /* could be same as puDst */
8455 RTUINT256U const uSrc2 = { *puSrc2 }; /* could be same as puDst */
8456 ASMCompilerBarrier();
8457 puDst->au64[0] = 0;
8458 puDst->au64[1] = 0;
8459 puDst->au64[2] = 0;
8460 puDst->au64[3] = 0;
8461 for (unsigned iByte = 0; iByte < 16; iByte++)
8462 {
8463 uint8_t idxSrc = uSrc2.au8[iByte];
8464 if (!(idxSrc & 0x80))
8465 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
8466 }
8467 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
8468 {
8469 uint8_t idxSrc = uSrc2.au8[iByte];
8470 if (!(idxSrc & 0x80))
8471 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
8472 }
8473 RT_NOREF(pExtState);
8474}
8475
8476
8477/*
8478 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
8479 */
8480#ifdef IEM_WITHOUT_ASSEMBLY
8481
8482IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
8483{
8484 uint64_t const uSrc = *puSrc;
8485 ASMCompilerBarrier();
8486 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
8487 uSrc >> (((bEvil >> 2) & 3) * 16),
8488 uSrc >> (((bEvil >> 4) & 3) * 16),
8489 uSrc >> (((bEvil >> 6) & 3) * 16));
8490}
8491
8492
8493IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
8494{
8495 puDst->QWords.qw0 = puSrc->QWords.qw0;
8496 uint64_t const uSrc = puSrc->QWords.qw1;
8497 ASMCompilerBarrier();
8498 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
8499 uSrc >> (((bEvil >> 2) & 3) * 16),
8500 uSrc >> (((bEvil >> 4) & 3) * 16),
8501 uSrc >> (((bEvil >> 6) & 3) * 16));
8502}
8503
8504#endif
8505
/**
 * VPSHUFHW ymm C fallback: for each 128-bit lane, the low qword is copied
 * through and the four high words are shuffled by the 2-bit selectors in the
 * immediate (same selectors applied to both lanes).
 *
 * @param   puDst   Destination operand.
 * @param   puSrc   Source operand (may alias puDst, hence copies + barrier
 *                  before any qword of puDst is written).
 * @param   bEvil   The shuffle immediate.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
{
    puDst->QWords.qw0 = puSrc->QWords.qw0;
    uint64_t const uSrc1 = puSrc->QWords.qw1;   /* high qword, low lane */
    puDst->QWords.qw2 = puSrc->QWords.qw2;
    uint64_t const uSrc3 = puSrc->QWords.qw3;   /* high qword, high lane */
    ASMCompilerBarrier();
    puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil       & 3) * 16),
                                             uSrc1 >> (((bEvil >> 2) & 3) * 16),
                                             uSrc1 >> (((bEvil >> 4) & 3) * 16),
                                             uSrc1 >> (((bEvil >> 6) & 3) * 16));
    puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil       & 3) * 16),
                                             uSrc3 >> (((bEvil >> 2) & 3) * 16),
                                             uSrc3 >> (((bEvil >> 4) & 3) * 16),
                                             uSrc3 >> (((bEvil >> 6) & 3) * 16));
}
8522
8523#ifdef IEM_WITHOUT_ASSEMBLY
8524IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
8525{
8526 puDst->QWords.qw1 = puSrc->QWords.qw1;
8527 uint64_t const uSrc = puSrc->QWords.qw0;
8528 ASMCompilerBarrier();
8529 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
8530 uSrc >> (((bEvil >> 2) & 3) * 16),
8531 uSrc >> (((bEvil >> 4) & 3) * 16),
8532 uSrc >> (((bEvil >> 6) & 3) * 16));
8533
8534}
8535#endif
8536
8537
/**
 * VPSHUFLW ymm C fallback: for each 128-bit lane, the high qword is copied
 * through and the four low words are shuffled by the 2-bit selectors in the
 * immediate (same selectors applied to both lanes).
 *
 * @param   puDst   Destination operand.
 * @param   puSrc   Source operand (may alias puDst, hence copies + barrier
 *                  before any qword of puDst is written).
 * @param   bEvil   The shuffle immediate.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
{
    puDst->QWords.qw3 = puSrc->QWords.qw3;
    uint64_t const uSrc2 = puSrc->QWords.qw2;   /* low qword, high lane */
    puDst->QWords.qw1 = puSrc->QWords.qw1;
    uint64_t const uSrc0 = puSrc->QWords.qw0;   /* low qword, low lane */
    ASMCompilerBarrier();
    puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil       & 3) * 16),
                                             uSrc0 >> (((bEvil >> 2) & 3) * 16),
                                             uSrc0 >> (((bEvil >> 4) & 3) * 16),
                                             uSrc0 >> (((bEvil >> 6) & 3) * 16));
    puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil       & 3) * 16),
                                             uSrc2 >> (((bEvil >> 2) & 3) * 16),
                                             uSrc2 >> (((bEvil >> 4) & 3) * 16),
                                             uSrc2 >> (((bEvil >> 6) & 3) * 16));

}
8555
8556
8557#ifdef IEM_WITHOUT_ASSEMBLY
8558IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
8559{
8560 RTUINT128U const uSrc = *puSrc;
8561 ASMCompilerBarrier();
8562 puDst->au32[0] = uSrc.au32[bEvil & 3];
8563 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
8564 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
8565 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
8566}
8567#endif
8568
8569
8570IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
8571{
8572 RTUINT256U const uSrc = *puSrc;
8573 ASMCompilerBarrier();
8574 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
8575 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
8576 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
8577 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
8578 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
8579 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
8580 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
8581 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
8582}
8583
8584
8585/*
8586 * PUNPCKHBW - high bytes -> words
8587 */
8588#ifdef IEM_WITHOUT_ASSEMBLY
8589
8590IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8591{
8592 RTUINT64U const uSrc2 = { *puSrc };
8593 RTUINT64U const uSrc1 = { *puDst };
8594 ASMCompilerBarrier();
8595 RTUINT64U uDstOut;
8596 uDstOut.au8[0] = uSrc1.au8[4];
8597 uDstOut.au8[1] = uSrc2.au8[4];
8598 uDstOut.au8[2] = uSrc1.au8[5];
8599 uDstOut.au8[3] = uSrc2.au8[5];
8600 uDstOut.au8[4] = uSrc1.au8[6];
8601 uDstOut.au8[5] = uSrc2.au8[6];
8602 uDstOut.au8[6] = uSrc1.au8[7];
8603 uDstOut.au8[7] = uSrc2.au8[7];
8604 *puDst = uDstOut.u;
8605}
8606
8607
8608IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8609{
8610 RTUINT128U const uSrc2 = { *puSrc };
8611 RTUINT128U const uSrc1 = { *puDst };
8612 ASMCompilerBarrier();
8613 RTUINT128U uDstOut;
8614 uDstOut.au8[ 0] = uSrc1.au8[ 8];
8615 uDstOut.au8[ 1] = uSrc2.au8[ 8];
8616 uDstOut.au8[ 2] = uSrc1.au8[ 9];
8617 uDstOut.au8[ 3] = uSrc2.au8[ 9];
8618 uDstOut.au8[ 4] = uSrc1.au8[10];
8619 uDstOut.au8[ 5] = uSrc2.au8[10];
8620 uDstOut.au8[ 6] = uSrc1.au8[11];
8621 uDstOut.au8[ 7] = uSrc2.au8[11];
8622 uDstOut.au8[ 8] = uSrc1.au8[12];
8623 uDstOut.au8[ 9] = uSrc2.au8[12];
8624 uDstOut.au8[10] = uSrc1.au8[13];
8625 uDstOut.au8[11] = uSrc2.au8[13];
8626 uDstOut.au8[12] = uSrc1.au8[14];
8627 uDstOut.au8[13] = uSrc2.au8[14];
8628 uDstOut.au8[14] = uSrc1.au8[15];
8629 uDstOut.au8[15] = uSrc2.au8[15];
8630 *puDst = uDstOut;
8631}
8632
8633#endif
8634
8635IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8636{
8637 RTUINT128U const uSrc2 = { *puSrc2 };
8638 RTUINT128U const uSrc1 = { *puSrc1 };
8639 ASMCompilerBarrier();
8640 RTUINT128U uDstOut;
8641 uDstOut.au8[ 0] = uSrc1.au8[ 8];
8642 uDstOut.au8[ 1] = uSrc2.au8[ 8];
8643 uDstOut.au8[ 2] = uSrc1.au8[ 9];
8644 uDstOut.au8[ 3] = uSrc2.au8[ 9];
8645 uDstOut.au8[ 4] = uSrc1.au8[10];
8646 uDstOut.au8[ 5] = uSrc2.au8[10];
8647 uDstOut.au8[ 6] = uSrc1.au8[11];
8648 uDstOut.au8[ 7] = uSrc2.au8[11];
8649 uDstOut.au8[ 8] = uSrc1.au8[12];
8650 uDstOut.au8[ 9] = uSrc2.au8[12];
8651 uDstOut.au8[10] = uSrc1.au8[13];
8652 uDstOut.au8[11] = uSrc2.au8[13];
8653 uDstOut.au8[12] = uSrc1.au8[14];
8654 uDstOut.au8[13] = uSrc2.au8[14];
8655 uDstOut.au8[14] = uSrc1.au8[15];
8656 uDstOut.au8[15] = uSrc2.au8[15];
8657 *puDst = uDstOut;
8658}
8659
8660
8661IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8662{
8663 RTUINT256U const uSrc2 = { *puSrc2 };
8664 RTUINT256U const uSrc1 = { *puSrc1 };
8665 ASMCompilerBarrier();
8666 RTUINT256U uDstOut;
8667 uDstOut.au8[ 0] = uSrc1.au8[ 8];
8668 uDstOut.au8[ 1] = uSrc2.au8[ 8];
8669 uDstOut.au8[ 2] = uSrc1.au8[ 9];
8670 uDstOut.au8[ 3] = uSrc2.au8[ 9];
8671 uDstOut.au8[ 4] = uSrc1.au8[10];
8672 uDstOut.au8[ 5] = uSrc2.au8[10];
8673 uDstOut.au8[ 6] = uSrc1.au8[11];
8674 uDstOut.au8[ 7] = uSrc2.au8[11];
8675 uDstOut.au8[ 8] = uSrc1.au8[12];
8676 uDstOut.au8[ 9] = uSrc2.au8[12];
8677 uDstOut.au8[10] = uSrc1.au8[13];
8678 uDstOut.au8[11] = uSrc2.au8[13];
8679 uDstOut.au8[12] = uSrc1.au8[14];
8680 uDstOut.au8[13] = uSrc2.au8[14];
8681 uDstOut.au8[14] = uSrc1.au8[15];
8682 uDstOut.au8[15] = uSrc2.au8[15];
8683 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
8684 uDstOut.au8[16] = uSrc1.au8[24];
8685 uDstOut.au8[17] = uSrc2.au8[24];
8686 uDstOut.au8[18] = uSrc1.au8[25];
8687 uDstOut.au8[19] = uSrc2.au8[25];
8688 uDstOut.au8[20] = uSrc1.au8[26];
8689 uDstOut.au8[21] = uSrc2.au8[26];
8690 uDstOut.au8[22] = uSrc1.au8[27];
8691 uDstOut.au8[23] = uSrc2.au8[27];
8692 uDstOut.au8[24] = uSrc1.au8[28];
8693 uDstOut.au8[25] = uSrc2.au8[28];
8694 uDstOut.au8[26] = uSrc1.au8[29];
8695 uDstOut.au8[27] = uSrc2.au8[29];
8696 uDstOut.au8[28] = uSrc1.au8[30];
8697 uDstOut.au8[29] = uSrc2.au8[30];
8698 uDstOut.au8[30] = uSrc1.au8[31];
8699 uDstOut.au8[31] = uSrc2.au8[31];
8700 *puDst = uDstOut;
8701}
8702
8703
8704/*
 * PUNPCKHWD - high words -> dwords
8706 */
8707#ifdef IEM_WITHOUT_ASSEMBLY
8708
8709IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
8710{
8711 RTUINT64U const uSrc2 = { *puSrc };
8712 RTUINT64U const uSrc1 = { *puDst };
8713 ASMCompilerBarrier();
8714 RTUINT64U uDstOut;
8715 uDstOut.au16[0] = uSrc1.au16[2];
8716 uDstOut.au16[1] = uSrc2.au16[2];
8717 uDstOut.au16[2] = uSrc1.au16[3];
8718 uDstOut.au16[3] = uSrc2.au16[3];
8719 *puDst = uDstOut.u;
8720}
8721
8722
8723IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8724{
8725 RTUINT128U const uSrc2 = { *puSrc };
8726 RTUINT128U const uSrc1 = { *puDst };
8727 ASMCompilerBarrier();
8728 RTUINT128U uDstOut;
8729 uDstOut.au16[0] = uSrc1.au16[4];
8730 uDstOut.au16[1] = uSrc2.au16[4];
8731 uDstOut.au16[2] = uSrc1.au16[5];
8732 uDstOut.au16[3] = uSrc2.au16[5];
8733 uDstOut.au16[4] = uSrc1.au16[6];
8734 uDstOut.au16[5] = uSrc2.au16[6];
8735 uDstOut.au16[6] = uSrc1.au16[7];
8736 uDstOut.au16[7] = uSrc2.au16[7];
8737 *puDst = uDstOut;
8738}
8739
8740#endif
8741
8742IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8743{
8744 RTUINT128U const uSrc2 = { *puSrc2 };
8745 RTUINT128U const uSrc1 = { *puSrc1 };
8746 ASMCompilerBarrier();
8747 RTUINT128U uDstOut;
8748 uDstOut.au16[0] = uSrc1.au16[4];
8749 uDstOut.au16[1] = uSrc2.au16[4];
8750 uDstOut.au16[2] = uSrc1.au16[5];
8751 uDstOut.au16[3] = uSrc2.au16[5];
8752 uDstOut.au16[4] = uSrc1.au16[6];
8753 uDstOut.au16[5] = uSrc2.au16[6];
8754 uDstOut.au16[6] = uSrc1.au16[7];
8755 uDstOut.au16[7] = uSrc2.au16[7];
8756 *puDst = uDstOut;
8757}
8758
8759
8760IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8761{
8762 RTUINT256U const uSrc2 = { *puSrc2 };
8763 RTUINT256U const uSrc1 = { *puSrc1 };
8764 ASMCompilerBarrier();
8765 RTUINT256U uDstOut;
8766 uDstOut.au16[0] = uSrc1.au16[4];
8767 uDstOut.au16[1] = uSrc2.au16[4];
8768 uDstOut.au16[2] = uSrc1.au16[5];
8769 uDstOut.au16[3] = uSrc2.au16[5];
8770 uDstOut.au16[4] = uSrc1.au16[6];
8771 uDstOut.au16[5] = uSrc2.au16[6];
8772 uDstOut.au16[6] = uSrc1.au16[7];
8773 uDstOut.au16[7] = uSrc2.au16[7];
8774
8775 uDstOut.au16[8] = uSrc1.au16[12];
8776 uDstOut.au16[9] = uSrc2.au16[12];
8777 uDstOut.au16[10] = uSrc1.au16[13];
8778 uDstOut.au16[11] = uSrc2.au16[13];
8779 uDstOut.au16[12] = uSrc1.au16[14];
8780 uDstOut.au16[13] = uSrc2.au16[14];
8781 uDstOut.au16[14] = uSrc1.au16[15];
8782 uDstOut.au16[15] = uSrc2.au16[15];
8783 *puDst = uDstOut;
8784}
8785
8786
8787/*
 * PUNPCKHDQ - high dwords -> qword(s)
8789 */
8790#ifdef IEM_WITHOUT_ASSEMBLY
8791
8792IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
8793{
8794 RTUINT64U const uSrc2 = { *puSrc };
8795 RTUINT64U const uSrc1 = { *puDst };
8796 ASMCompilerBarrier();
8797 RTUINT64U uDstOut;
8798 uDstOut.au32[0] = uSrc1.au32[1];
8799 uDstOut.au32[1] = uSrc2.au32[1];
8800 *puDst = uDstOut.u;
8801}
8802
8803
8804IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8805{
8806 RTUINT128U const uSrc2 = { *puSrc };
8807 RTUINT128U const uSrc1 = { *puDst };
8808 ASMCompilerBarrier();
8809 RTUINT128U uDstOut;
8810 uDstOut.au32[0] = uSrc1.au32[2];
8811 uDstOut.au32[1] = uSrc2.au32[2];
8812 uDstOut.au32[2] = uSrc1.au32[3];
8813 uDstOut.au32[3] = uSrc2.au32[3];
8814 *puDst = uDstOut;
8815}
8816
8817#endif
8818
8819IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8820{
8821 RTUINT128U const uSrc2 = { *puSrc2 };
8822 RTUINT128U const uSrc1 = { *puSrc1 };
8823 ASMCompilerBarrier();
8824 RTUINT128U uDstOut;
8825 uDstOut.au32[0] = uSrc1.au32[2];
8826 uDstOut.au32[1] = uSrc2.au32[2];
8827 uDstOut.au32[2] = uSrc1.au32[3];
8828 uDstOut.au32[3] = uSrc2.au32[3];
8829 *puDst = uDstOut;
8830}
8831
8832
8833IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8834{
8835 RTUINT256U const uSrc2 = { *puSrc2 };
8836 RTUINT256U const uSrc1 = { *puSrc1 };
8837 ASMCompilerBarrier();
8838 RTUINT256U uDstOut;
8839 uDstOut.au32[0] = uSrc1.au32[2];
8840 uDstOut.au32[1] = uSrc2.au32[2];
8841 uDstOut.au32[2] = uSrc1.au32[3];
8842 uDstOut.au32[3] = uSrc2.au32[3];
8843
8844 uDstOut.au32[4] = uSrc1.au32[6];
8845 uDstOut.au32[5] = uSrc2.au32[6];
8846 uDstOut.au32[6] = uSrc1.au32[7];
8847 uDstOut.au32[7] = uSrc2.au32[7];
8848 *puDst = uDstOut;
8849}
8850
8851
8852/*
8853 * PUNPCKHQDQ -> High qwords -> double qword(s).
8854 */
8855#ifdef IEM_WITHOUT_ASSEMBLY
8856IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8857{
8858 RTUINT128U const uSrc2 = { *puSrc };
8859 RTUINT128U const uSrc1 = { *puDst };
8860 ASMCompilerBarrier();
8861 RTUINT128U uDstOut;
8862 uDstOut.au64[0] = uSrc1.au64[1];
8863 uDstOut.au64[1] = uSrc2.au64[1];
8864 *puDst = uDstOut;
8865}
8866#endif
8867
8868
8869IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8870{
8871 RTUINT128U const uSrc2 = { *puSrc2 };
8872 RTUINT128U const uSrc1 = { *puSrc1 };
8873 ASMCompilerBarrier();
8874 RTUINT128U uDstOut;
8875 uDstOut.au64[0] = uSrc1.au64[1];
8876 uDstOut.au64[1] = uSrc2.au64[1];
8877 *puDst = uDstOut;
8878}
8879
8880
8881IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8882{
8883 RTUINT256U const uSrc2 = { *puSrc2 };
8884 RTUINT256U const uSrc1 = { *puSrc1 };
8885 ASMCompilerBarrier();
8886 RTUINT256U uDstOut;
8887 uDstOut.au64[0] = uSrc1.au64[1];
8888 uDstOut.au64[1] = uSrc2.au64[1];
8889
8890 uDstOut.au64[2] = uSrc1.au64[3];
8891 uDstOut.au64[3] = uSrc2.au64[3];
8892 *puDst = uDstOut;
8893}
8894
8895
8896/*
8897 * PUNPCKLBW - low bytes -> words
8898 */
8899#ifdef IEM_WITHOUT_ASSEMBLY
8900
8901IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8902{
8903 RTUINT64U const uSrc2 = { *puSrc };
8904 RTUINT64U const uSrc1 = { *puDst };
8905 ASMCompilerBarrier();
8906 RTUINT64U uDstOut;
8907 uDstOut.au8[0] = uSrc1.au8[0];
8908 uDstOut.au8[1] = uSrc2.au8[0];
8909 uDstOut.au8[2] = uSrc1.au8[1];
8910 uDstOut.au8[3] = uSrc2.au8[1];
8911 uDstOut.au8[4] = uSrc1.au8[2];
8912 uDstOut.au8[5] = uSrc2.au8[2];
8913 uDstOut.au8[6] = uSrc1.au8[3];
8914 uDstOut.au8[7] = uSrc2.au8[3];
8915 *puDst = uDstOut.u;
8916}
8917
8918
8919IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8920{
8921 RTUINT128U const uSrc2 = { *puSrc };
8922 RTUINT128U const uSrc1 = { *puDst };
8923 ASMCompilerBarrier();
8924 RTUINT128U uDstOut;
8925 uDstOut.au8[ 0] = uSrc1.au8[0];
8926 uDstOut.au8[ 1] = uSrc2.au8[0];
8927 uDstOut.au8[ 2] = uSrc1.au8[1];
8928 uDstOut.au8[ 3] = uSrc2.au8[1];
8929 uDstOut.au8[ 4] = uSrc1.au8[2];
8930 uDstOut.au8[ 5] = uSrc2.au8[2];
8931 uDstOut.au8[ 6] = uSrc1.au8[3];
8932 uDstOut.au8[ 7] = uSrc2.au8[3];
8933 uDstOut.au8[ 8] = uSrc1.au8[4];
8934 uDstOut.au8[ 9] = uSrc2.au8[4];
8935 uDstOut.au8[10] = uSrc1.au8[5];
8936 uDstOut.au8[11] = uSrc2.au8[5];
8937 uDstOut.au8[12] = uSrc1.au8[6];
8938 uDstOut.au8[13] = uSrc2.au8[6];
8939 uDstOut.au8[14] = uSrc1.au8[7];
8940 uDstOut.au8[15] = uSrc2.au8[7];
8941 *puDst = uDstOut;
8942}
8943
8944#endif
8945
8946IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8947{
8948 RTUINT128U const uSrc2 = { *puSrc2 };
8949 RTUINT128U const uSrc1 = { *puSrc1 };
8950 ASMCompilerBarrier();
8951 RTUINT128U uDstOut;
8952 uDstOut.au8[ 0] = uSrc1.au8[0];
8953 uDstOut.au8[ 1] = uSrc2.au8[0];
8954 uDstOut.au8[ 2] = uSrc1.au8[1];
8955 uDstOut.au8[ 3] = uSrc2.au8[1];
8956 uDstOut.au8[ 4] = uSrc1.au8[2];
8957 uDstOut.au8[ 5] = uSrc2.au8[2];
8958 uDstOut.au8[ 6] = uSrc1.au8[3];
8959 uDstOut.au8[ 7] = uSrc2.au8[3];
8960 uDstOut.au8[ 8] = uSrc1.au8[4];
8961 uDstOut.au8[ 9] = uSrc2.au8[4];
8962 uDstOut.au8[10] = uSrc1.au8[5];
8963 uDstOut.au8[11] = uSrc2.au8[5];
8964 uDstOut.au8[12] = uSrc1.au8[6];
8965 uDstOut.au8[13] = uSrc2.au8[6];
8966 uDstOut.au8[14] = uSrc1.au8[7];
8967 uDstOut.au8[15] = uSrc2.au8[7];
8968 *puDst = uDstOut;
8969}
8970
8971
8972IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8973{
8974 RTUINT256U const uSrc2 = { *puSrc2 };
8975 RTUINT256U const uSrc1 = { *puSrc1 };
8976 ASMCompilerBarrier();
8977 RTUINT256U uDstOut;
8978 uDstOut.au8[ 0] = uSrc1.au8[0];
8979 uDstOut.au8[ 1] = uSrc2.au8[0];
8980 uDstOut.au8[ 2] = uSrc1.au8[1];
8981 uDstOut.au8[ 3] = uSrc2.au8[1];
8982 uDstOut.au8[ 4] = uSrc1.au8[2];
8983 uDstOut.au8[ 5] = uSrc2.au8[2];
8984 uDstOut.au8[ 6] = uSrc1.au8[3];
8985 uDstOut.au8[ 7] = uSrc2.au8[3];
8986 uDstOut.au8[ 8] = uSrc1.au8[4];
8987 uDstOut.au8[ 9] = uSrc2.au8[4];
8988 uDstOut.au8[10] = uSrc1.au8[5];
8989 uDstOut.au8[11] = uSrc2.au8[5];
8990 uDstOut.au8[12] = uSrc1.au8[6];
8991 uDstOut.au8[13] = uSrc2.au8[6];
8992 uDstOut.au8[14] = uSrc1.au8[7];
8993 uDstOut.au8[15] = uSrc2.au8[7];
8994 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
8995 uDstOut.au8[16] = uSrc1.au8[16];
8996 uDstOut.au8[17] = uSrc2.au8[16];
8997 uDstOut.au8[18] = uSrc1.au8[17];
8998 uDstOut.au8[19] = uSrc2.au8[17];
8999 uDstOut.au8[20] = uSrc1.au8[18];
9000 uDstOut.au8[21] = uSrc2.au8[18];
9001 uDstOut.au8[22] = uSrc1.au8[19];
9002 uDstOut.au8[23] = uSrc2.au8[19];
9003 uDstOut.au8[24] = uSrc1.au8[20];
9004 uDstOut.au8[25] = uSrc2.au8[20];
9005 uDstOut.au8[26] = uSrc1.au8[21];
9006 uDstOut.au8[27] = uSrc2.au8[21];
9007 uDstOut.au8[28] = uSrc1.au8[22];
9008 uDstOut.au8[29] = uSrc2.au8[22];
9009 uDstOut.au8[30] = uSrc1.au8[23];
9010 uDstOut.au8[31] = uSrc2.au8[23];
9011 *puDst = uDstOut;
9012}
9013
9014
9015/*
 * PUNPCKLWD - low words -> dwords
9017 */
9018#ifdef IEM_WITHOUT_ASSEMBLY
9019
9020IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
9021{
9022 RTUINT64U const uSrc2 = { *puSrc };
9023 RTUINT64U const uSrc1 = { *puDst };
9024 ASMCompilerBarrier();
9025 RTUINT64U uDstOut;
9026 uDstOut.au16[0] = uSrc1.au16[0];
9027 uDstOut.au16[1] = uSrc2.au16[0];
9028 uDstOut.au16[2] = uSrc1.au16[1];
9029 uDstOut.au16[3] = uSrc2.au16[1];
9030 *puDst = uDstOut.u;
9031}
9032
9033
9034IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9035{
9036 RTUINT128U const uSrc2 = { *puSrc };
9037 RTUINT128U const uSrc1 = { *puDst };
9038 ASMCompilerBarrier();
9039 RTUINT128U uDstOut;
9040 uDstOut.au16[0] = uSrc1.au16[0];
9041 uDstOut.au16[1] = uSrc2.au16[0];
9042 uDstOut.au16[2] = uSrc1.au16[1];
9043 uDstOut.au16[3] = uSrc2.au16[1];
9044 uDstOut.au16[4] = uSrc1.au16[2];
9045 uDstOut.au16[5] = uSrc2.au16[2];
9046 uDstOut.au16[6] = uSrc1.au16[3];
9047 uDstOut.au16[7] = uSrc2.au16[3];
9048 *puDst = uDstOut;
9049}
9050
9051#endif
9052
9053IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9054{
9055 RTUINT128U const uSrc2 = { *puSrc2 };
9056 RTUINT128U const uSrc1 = { *puSrc1 };
9057 ASMCompilerBarrier();
9058 RTUINT128U uDstOut;
9059 uDstOut.au16[0] = uSrc1.au16[0];
9060 uDstOut.au16[1] = uSrc2.au16[0];
9061 uDstOut.au16[2] = uSrc1.au16[1];
9062 uDstOut.au16[3] = uSrc2.au16[1];
9063 uDstOut.au16[4] = uSrc1.au16[2];
9064 uDstOut.au16[5] = uSrc2.au16[2];
9065 uDstOut.au16[6] = uSrc1.au16[3];
9066 uDstOut.au16[7] = uSrc2.au16[3];
9067 *puDst = uDstOut;
9068}
9069
9070
9071IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9072{
9073 RTUINT256U const uSrc2 = { *puSrc2 };
9074 RTUINT256U const uSrc1 = { *puSrc1 };
9075 ASMCompilerBarrier();
9076 RTUINT256U uDstOut;
9077 uDstOut.au16[0] = uSrc1.au16[0];
9078 uDstOut.au16[1] = uSrc2.au16[0];
9079 uDstOut.au16[2] = uSrc1.au16[1];
9080 uDstOut.au16[3] = uSrc2.au16[1];
9081 uDstOut.au16[4] = uSrc1.au16[2];
9082 uDstOut.au16[5] = uSrc2.au16[2];
9083 uDstOut.au16[6] = uSrc1.au16[3];
9084 uDstOut.au16[7] = uSrc2.au16[3];
9085
9086 uDstOut.au16[8] = uSrc1.au16[8];
9087 uDstOut.au16[9] = uSrc2.au16[8];
9088 uDstOut.au16[10] = uSrc1.au16[9];
9089 uDstOut.au16[11] = uSrc2.au16[9];
9090 uDstOut.au16[12] = uSrc1.au16[10];
9091 uDstOut.au16[13] = uSrc2.au16[10];
9092 uDstOut.au16[14] = uSrc1.au16[11];
9093 uDstOut.au16[15] = uSrc2.au16[11];
9094 *puDst = uDstOut;
9095}
9096
9097
9098/*
 * PUNPCKLDQ - low dwords -> qword(s)
9100 */
9101#ifdef IEM_WITHOUT_ASSEMBLY
9102
9103IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9104{
9105 RTUINT64U const uSrc2 = { *puSrc };
9106 RTUINT64U const uSrc1 = { *puDst };
9107 ASMCompilerBarrier();
9108 RTUINT64U uDstOut;
9109 uDstOut.au32[0] = uSrc1.au32[0];
9110 uDstOut.au32[1] = uSrc2.au32[0];
9111 *puDst = uDstOut.u;
9112}
9113
9114
9115IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9116{
9117 RTUINT128U const uSrc2 = { *puSrc };
9118 RTUINT128U const uSrc1 = { *puDst };
9119 ASMCompilerBarrier();
9120 RTUINT128U uDstOut;
9121 uDstOut.au32[0] = uSrc1.au32[0];
9122 uDstOut.au32[1] = uSrc2.au32[0];
9123 uDstOut.au32[2] = uSrc1.au32[1];
9124 uDstOut.au32[3] = uSrc2.au32[1];
9125 *puDst = uDstOut;
9126}
9127
9128#endif
9129
9130IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9131{
9132 RTUINT128U const uSrc2 = { *puSrc2 };
9133 RTUINT128U const uSrc1 = { *puSrc1 };
9134 ASMCompilerBarrier();
9135 RTUINT128U uDstOut;
9136 uDstOut.au32[0] = uSrc1.au32[0];
9137 uDstOut.au32[1] = uSrc2.au32[0];
9138 uDstOut.au32[2] = uSrc1.au32[1];
9139 uDstOut.au32[3] = uSrc2.au32[1];
9140 *puDst = uDstOut;
9141}
9142
9143
9144IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9145{
9146 RTUINT256U const uSrc2 = { *puSrc2 };
9147 RTUINT256U const uSrc1 = { *puSrc1 };
9148 ASMCompilerBarrier();
9149 RTUINT256U uDstOut;
9150 uDstOut.au32[0] = uSrc1.au32[0];
9151 uDstOut.au32[1] = uSrc2.au32[0];
9152 uDstOut.au32[2] = uSrc1.au32[1];
9153 uDstOut.au32[3] = uSrc2.au32[1];
9154
9155 uDstOut.au32[4] = uSrc1.au32[4];
9156 uDstOut.au32[5] = uSrc2.au32[4];
9157 uDstOut.au32[6] = uSrc1.au32[5];
9158 uDstOut.au32[7] = uSrc2.au32[5];
9159 *puDst = uDstOut;
9160}
9161
9162
9163/*
9164 * PUNPCKLQDQ -> Low qwords -> double qword(s).
9165 */
9166#ifdef IEM_WITHOUT_ASSEMBLY
9167IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9168{
9169 RTUINT128U const uSrc2 = { *puSrc };
9170 RTUINT128U const uSrc1 = { *puDst };
9171 ASMCompilerBarrier();
9172 RTUINT128U uDstOut;
9173 uDstOut.au64[0] = uSrc1.au64[0];
9174 uDstOut.au64[1] = uSrc2.au64[0];
9175 *puDst = uDstOut;
9176}
9177#endif
9178
9179
9180IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9181{
9182 RTUINT128U const uSrc2 = { *puSrc2 };
9183 RTUINT128U const uSrc1 = { *puSrc1 };
9184 ASMCompilerBarrier();
9185 RTUINT128U uDstOut;
9186 uDstOut.au64[0] = uSrc1.au64[0];
9187 uDstOut.au64[1] = uSrc2.au64[0];
9188 *puDst = uDstOut;
9189}
9190
9191
9192IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9193{
9194 RTUINT256U const uSrc2 = { *puSrc2 };
9195 RTUINT256U const uSrc1 = { *puSrc1 };
9196 ASMCompilerBarrier();
9197 RTUINT256U uDstOut;
9198 uDstOut.au64[0] = uSrc1.au64[0];
9199 uDstOut.au64[1] = uSrc2.au64[0];
9200
9201 uDstOut.au64[2] = uSrc1.au64[2];
9202 uDstOut.au64[3] = uSrc2.au64[2];
9203 *puDst = uDstOut;
9204}
9205
9206
9207/*
9208 * PACKSSWB - signed words -> signed bytes
9209 */
/** Saturates a 16-bit word (interpreted as signed) to a signed 8-bit byte.
 * The first operand of the ternary catches every in-range value [-128,127]
 * with a single unsigned compare by biasing the word with +0x80. */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) <= (uint16_t)0xff \
      ? (uint8_t)(a_iWord) \
      : (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) ) /* 0x7f = INT8_MAX; 0x80 = INT8_MIN; source bit 15 = sign */
9214
9215#ifdef IEM_WITHOUT_ASSEMBLY
9216
9217IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9218{
9219 RTUINT64U const uSrc2 = { *puSrc };
9220 RTUINT64U const uSrc1 = { *puDst };
9221 ASMCompilerBarrier();
9222 RTUINT64U uDstOut;
9223 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
9224 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
9225 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
9226 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
9227 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
9228 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
9229 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
9230 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
9231 *puDst = uDstOut.u;
9232}
9233
9234
9235IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9236{
9237 RTUINT128U const uSrc2 = { *puSrc };
9238 RTUINT128U const uSrc1 = { *puDst };
9239 ASMCompilerBarrier();
9240 RTUINT128U uDstOut;
9241 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
9242 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
9243 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
9244 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
9245 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
9246 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
9247 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
9248 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
9249 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
9250 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
9251 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
9252 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
9253 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
9254 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
9255 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
9256 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
9257 *puDst = uDstOut;
9258}
9259
9260#endif
9261
9262IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9263{
9264 RTUINT128U const uSrc2 = { *puSrc2 };
9265 RTUINT128U const uSrc1 = { *puSrc1 };
9266 ASMCompilerBarrier();
9267 RTUINT128U uDstOut;
9268 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
9269 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
9270 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
9271 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
9272 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
9273 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
9274 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
9275 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
9276 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
9277 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
9278 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
9279 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
9280 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
9281 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
9282 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
9283 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
9284 *puDst = uDstOut;
9285}
9286
9287
9288IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9289{
9290 RTUINT256U const uSrc2 = { *puSrc2 };
9291 RTUINT256U const uSrc1 = { *puSrc1 };
9292 ASMCompilerBarrier();
9293 RTUINT256U uDstOut;
9294 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
9295 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
9296 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
9297 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
9298 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
9299 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
9300 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
9301 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
9302 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
9303 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
9304 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
9305 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
9306 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
9307 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
9308 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
9309 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
9310
9311 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
9312 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
9313 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
9314 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
9315 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
9316 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
9317 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
9318 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
9319 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
9320 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
9321 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
9322 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
9323 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
9324 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
9325 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
9326 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
9327 *puDst = uDstOut;
9328}
9329
9330
9331/*
9332 * PACKUSWB - signed words -> unsigned bytes
9333 */
/** Saturates a 16-bit word (interpreted as signed) to an unsigned 8-bit byte:
 * negative inputs (bit 15 set) become 0, values above 255 become 0xff, and
 * in-range values [0,255] pass through unchanged. */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    ( (uint16_t)(a_iWord) <= (uint16_t)0xff \
      ? (uint8_t)(a_iWord) \
      : (uint8_t)0xff * (uint8_t)((((a_iWord) >> 15) & 1) ^ 1) ) /* 0xff = UINT8_MAX; 0x00 == UINT8_MIN; source bit 15 = sign */
9338
9339#ifdef IEM_WITHOUT_ASSEMBLY
9340
9341IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9342{
9343 RTUINT64U const uSrc2 = { *puSrc };
9344 RTUINT64U const uSrc1 = { *puDst };
9345 ASMCompilerBarrier();
9346 RTUINT64U uDstOut;
9347 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
9348 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
9349 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
9350 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
9351 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
9352 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
9353 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
9354 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
9355 *puDst = uDstOut.u;
9356}
9357
9358
9359IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9360{
9361 RTUINT128U const uSrc2 = { *puSrc };
9362 RTUINT128U const uSrc1 = { *puDst };
9363 ASMCompilerBarrier();
9364 RTUINT128U uDstOut;
9365 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
9366 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
9367 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
9368 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
9369 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
9370 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
9371 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
9372 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
9373 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
9374 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
9375 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
9376 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
9377 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
9378 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
9379 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
9380 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
9381 *puDst = uDstOut;
9382}
9383
9384#endif
9385
9386IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9387{
9388 RTUINT128U const uSrc2 = { *puSrc2 };
9389 RTUINT128U const uSrc1 = { *puSrc1 };
9390 ASMCompilerBarrier();
9391 RTUINT128U uDstOut;
9392 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
9393 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
9394 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
9395 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
9396 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
9397 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
9398 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
9399 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
9400 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
9401 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
9402 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
9403 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
9404 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
9405 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
9406 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
9407 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
9408 *puDst = uDstOut;
9409}
9410
9411
9412IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9413{
9414 RTUINT256U const uSrc2 = { *puSrc2 };
9415 RTUINT256U const uSrc1 = { *puSrc1 };
9416 ASMCompilerBarrier();
9417 RTUINT256U uDstOut;
9418 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
9419 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
9420 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
9421 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
9422 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
9423 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
9424 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
9425 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
9426 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
9427 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
9428 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
9429 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
9430 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
9431 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
9432 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
9433 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
9434
9435 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
9436 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
9437 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
9438 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
9439 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
9440 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
9441 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
9442 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
9443 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
9444 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
9445 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
9446 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
9447 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
9448 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
9449 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
9450 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
9451 *puDst = uDstOut;
9452}
9453
9454
/*
 * PACKSSDW - signed dwords -> signed words
 */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    (  (uint32_t)((a_iDword) + UINT32_C(0x8000)) < UINT32_C(0x10000) \
     ? (uint16_t)(a_iDword) /* biasing by 0x8000 maps [INT16_MIN, INT16_MAX] onto [0, 0xffff] */ \
     : (((a_iDword) >> 31) & 1) ? (uint16_t)0x8000 : (uint16_t)0x7fff ) /* bit 31 set = negative -> INT16_MIN (0x8000); else -> INT16_MAX (0x7fff) */
9462
9463#ifdef IEM_WITHOUT_ASSEMBLY
9464
9465IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9466{
9467 RTUINT64U const uSrc2 = { *puSrc };
9468 RTUINT64U const uSrc1 = { *puDst };
9469 ASMCompilerBarrier();
9470 RTUINT64U uDstOut;
9471 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
9472 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
9473 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
9474 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
9475 *puDst = uDstOut.u;
9476}
9477
9478
9479IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9480{
9481 RTUINT128U const uSrc2 = { *puSrc };
9482 RTUINT128U const uSrc1 = { *puDst };
9483 ASMCompilerBarrier();
9484 RTUINT128U uDstOut;
9485 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
9486 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
9487 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
9488 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
9489 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
9490 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
9491 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
9492 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
9493 *puDst = uDstOut;
9494}
9495
9496#endif
9497
9498IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9499{
9500 RTUINT128U const uSrc2 = { *puSrc2 };
9501 RTUINT128U const uSrc1 = { *puSrc1 };
9502 ASMCompilerBarrier();
9503 RTUINT128U uDstOut;
9504 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
9505 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
9506 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
9507 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
9508 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
9509 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
9510 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
9511 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
9512 *puDst = uDstOut;
9513}
9514
9515
9516IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9517{
9518 RTUINT256U const uSrc2 = { *puSrc2 };
9519 RTUINT256U const uSrc1 = { *puSrc1 };
9520 ASMCompilerBarrier();
9521 RTUINT256U uDstOut;
9522 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
9523 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
9524 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
9525 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
9526 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
9527 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
9528 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
9529 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
9530
9531 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
9532 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
9533 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
9534 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
9535 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
9536 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
9537 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
9538 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
9539 *puDst = uDstOut;
9540}
9541
9542
/*
 * PACKUSDW - signed dwords -> unsigned words
 */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    (  (uint32_t)(a_iDword) < UINT32_C(0x10000) \
     ? (uint16_t)(a_iDword) /* already within [0, 0xffff] */ \
     : (((a_iDword) >> 31) & 1) ? (uint16_t)0x0000 : (uint16_t)0xffff ) /* bit 31 set = negative -> UINT16_MIN (0x0000); else -> UINT16_MAX (0xffff) */
9550
9551#ifdef IEM_WITHOUT_ASSEMBLY
9552IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9553{
9554 RTUINT128U const uSrc2 = { *puSrc };
9555 RTUINT128U const uSrc1 = { *puDst };
9556 ASMCompilerBarrier();
9557 RTUINT128U uDstOut;
9558 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
9559 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
9560 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
9561 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
9562 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
9563 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
9564 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
9565 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
9566 *puDst = uDstOut;
9567}
9568#endif
9569
9570IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9571{
9572 RTUINT128U const uSrc2 = { *puSrc2 };
9573 RTUINT128U const uSrc1 = { *puSrc1 };
9574 ASMCompilerBarrier();
9575 RTUINT128U uDstOut;
9576 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
9577 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
9578 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
9579 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
9580 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
9581 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
9582 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
9583 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
9584 *puDst = uDstOut;
9585}
9586
9587
9588IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9589{
9590 RTUINT256U const uSrc2 = { *puSrc2 };
9591 RTUINT256U const uSrc1 = { *puSrc1 };
9592 ASMCompilerBarrier();
9593 RTUINT256U uDstOut;
9594 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
9595 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
9596 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
9597 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
9598 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
9599 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
9600 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
9601 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
9602
9603 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
9604 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
9605 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
9606 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
9607 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
9608 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
9609 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
9610 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
9611 *puDst = uDstOut;
9612}
9613
9614
/*
 * CRC32 (SSE 4.2).
 */
9618
9619IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
9620{
9621 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
9622}
9623
9624
9625IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
9626{
9627 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
9628}
9629
9630IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
9631{
9632 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
9633}
9634
9635IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
9636{
9637 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
9638}
9639
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette