VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 95953

Last change on this file since 95953 was 95951, checked in by vboxsync, 3 years ago

IEM: Added PMAXUB/PMINUB and corrected some of the shift variants.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 408.2 KB
Line 
1/* $Id: IEMAllAImplC.cpp 95951 2022-07-29 15:57:41Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2022 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18
19/*********************************************************************************************************************************
20* Header Files *
21*********************************************************************************************************************************/
22#include "IEMInternal.h"
23#include <VBox/vmm/vmcc.h>
24#include <iprt/errcore.h>
25#include <iprt/x86.h>
26#include <iprt/uint128.h>
27#include <iprt/uint256.h>
28#include <iprt/crc.h>
29
30RT_C_DECLS_BEGIN
31#include <softfloat.h>
32RT_C_DECLS_END
33
34
35/*********************************************************************************************************************************
36* Defined Constants And Macros *
37*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless already provided by the build, it is defined here for ARM32 and
 * ARM64 targets and when Doxygen is running.
 */
#if !defined(IEM_WITHOUT_ASSEMBLY)
# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
#  define IEM_WITHOUT_ASSEMBLY
# endif
#endif
/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
#ifdef IEM_WITH_ASSEMBLY
# undef IEM_WITHOUT_ASSEMBLY
#endif
50
/**
 * Calculates the sign flag value given a result and its bit width.
 *
 * The sign flag (SF) is a duplicate of the most significant bit in the
 * result, so the result is shifted right until that bit lands on
 * X86_EFL_SF_BIT and everything else is masked off.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) & X86_EFL_SF )
63
/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) indicates whether the result is zero or not: the
 * comparison yields 1 or 0, which is then moved to the ZF bit position.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)((a_uResult) == 0) << X86_EFL_ZF_BIT )
74
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * The input carries the overflow indication in its most significant bit; each
 * variant moves that bit to X86_EFL_OF_BIT and masks off the rest.
 *
 * These are typically used by concatenating the name with a bit count.  The
 * problem is that 8-bit values need shifting in the other direction (left)
 * than the others (right), hence one macro per width.
 */
#define X86_EFL_GET_OF_8(a_uValue)  (((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_16(a_uValue) ((uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_32(a_uValue) ((uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_64(a_uValue) ((uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
85
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after an arithmetic op.
 *
 * This is a statement macro: it yields no value.  It clears
 * X86_EFL_STATUS_BITS in *a_pfEFlags, ORs in the freshly calculated flags and
 * stores the result back; bits outside X86_EFL_STATUS_BITS are kept.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  It is
 *                          token-pasted into a uintNN_t type name and an
 *                          X86_EFL_GET_OF_NN macro name, so it must be a plain
 *                          literal.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 *
 * @note    Several arguments are expanded more than once; pass expressions
 *          without side effects.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* Overflow during ADDition happens when both inputs have the same sign \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
           follows that for SUBtraction the sign bit value must differ between \
           the two inputs and the result's sign bit must differ from the first \
           input's. \
           Note! Callers must xor the source with the sign bit to convert (that \
                 is what a_uSrcOf is for), not do (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                      & RT_BIT_64(a_cBitsWidth - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
122
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand is
 * undefined.  We do not set AF, as that seems to make the most sense (which
 * probably makes it the most wrong in real life).
 *
 * This is a statement macro: it yields no value.  It clears
 * X86_EFL_STATUS_BITS in *a_pfEFlags, ORs in PF, ZF, SF and a_fExtra, and
 * stores the result back.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 *
 * @note    a_uResult is expanded more than once; pass an expression without
 *          side effects.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
146
147
148/*********************************************************************************************************************************
149* Global Variables *
150*********************************************************************************************************************************/
151/**
152 * Parity calculation table.
153 *
154 * This is also used by iemAllAImpl.asm.
155 *
156 * The generator code:
157 * @code
158 * #include <stdio.h>
159 *
160 * int main()
161 * {
162 * unsigned b;
163 * for (b = 0; b < 256; b++)
164 * {
165 * int cOnes = ( b & 1)
166 * + ((b >> 1) & 1)
167 * + ((b >> 2) & 1)
168 * + ((b >> 3) & 1)
169 * + ((b >> 4) & 1)
170 * + ((b >> 5) & 1)
171 * + ((b >> 6) & 1)
172 * + ((b >> 7) & 1);
173 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
174 * b,
175 * (b >> 7) & 1,
176 * (b >> 6) & 1,
177 * (b >> 5) & 1,
178 * (b >> 4) & 1,
179 * (b >> 3) & 1,
180 * (b >> 2) & 1,
181 * (b >> 1) & 1,
182 * b & 1,
183 * cOnes & 1 ? "0" : "X86_EFL_PF");
184 * }
185 * return 0;
186 * }
187 * @endcode
188 */
189uint8_t const g_afParity[256] =
190{
191 /* 0000 = 00000000b */ X86_EFL_PF,
192 /* 0x01 = 00000001b */ 0,
193 /* 0x02 = 00000010b */ 0,
194 /* 0x03 = 00000011b */ X86_EFL_PF,
195 /* 0x04 = 00000100b */ 0,
196 /* 0x05 = 00000101b */ X86_EFL_PF,
197 /* 0x06 = 00000110b */ X86_EFL_PF,
198 /* 0x07 = 00000111b */ 0,
199 /* 0x08 = 00001000b */ 0,
200 /* 0x09 = 00001001b */ X86_EFL_PF,
201 /* 0x0a = 00001010b */ X86_EFL_PF,
202 /* 0x0b = 00001011b */ 0,
203 /* 0x0c = 00001100b */ X86_EFL_PF,
204 /* 0x0d = 00001101b */ 0,
205 /* 0x0e = 00001110b */ 0,
206 /* 0x0f = 00001111b */ X86_EFL_PF,
207 /* 0x10 = 00010000b */ 0,
208 /* 0x11 = 00010001b */ X86_EFL_PF,
209 /* 0x12 = 00010010b */ X86_EFL_PF,
210 /* 0x13 = 00010011b */ 0,
211 /* 0x14 = 00010100b */ X86_EFL_PF,
212 /* 0x15 = 00010101b */ 0,
213 /* 0x16 = 00010110b */ 0,
214 /* 0x17 = 00010111b */ X86_EFL_PF,
215 /* 0x18 = 00011000b */ X86_EFL_PF,
216 /* 0x19 = 00011001b */ 0,
217 /* 0x1a = 00011010b */ 0,
218 /* 0x1b = 00011011b */ X86_EFL_PF,
219 /* 0x1c = 00011100b */ 0,
220 /* 0x1d = 00011101b */ X86_EFL_PF,
221 /* 0x1e = 00011110b */ X86_EFL_PF,
222 /* 0x1f = 00011111b */ 0,
223 /* 0x20 = 00100000b */ 0,
224 /* 0x21 = 00100001b */ X86_EFL_PF,
225 /* 0x22 = 00100010b */ X86_EFL_PF,
226 /* 0x23 = 00100011b */ 0,
227 /* 0x24 = 00100100b */ X86_EFL_PF,
228 /* 0x25 = 00100101b */ 0,
229 /* 0x26 = 00100110b */ 0,
230 /* 0x27 = 00100111b */ X86_EFL_PF,
231 /* 0x28 = 00101000b */ X86_EFL_PF,
232 /* 0x29 = 00101001b */ 0,
233 /* 0x2a = 00101010b */ 0,
234 /* 0x2b = 00101011b */ X86_EFL_PF,
235 /* 0x2c = 00101100b */ 0,
236 /* 0x2d = 00101101b */ X86_EFL_PF,
237 /* 0x2e = 00101110b */ X86_EFL_PF,
238 /* 0x2f = 00101111b */ 0,
239 /* 0x30 = 00110000b */ X86_EFL_PF,
240 /* 0x31 = 00110001b */ 0,
241 /* 0x32 = 00110010b */ 0,
242 /* 0x33 = 00110011b */ X86_EFL_PF,
243 /* 0x34 = 00110100b */ 0,
244 /* 0x35 = 00110101b */ X86_EFL_PF,
245 /* 0x36 = 00110110b */ X86_EFL_PF,
246 /* 0x37 = 00110111b */ 0,
247 /* 0x38 = 00111000b */ 0,
248 /* 0x39 = 00111001b */ X86_EFL_PF,
249 /* 0x3a = 00111010b */ X86_EFL_PF,
250 /* 0x3b = 00111011b */ 0,
251 /* 0x3c = 00111100b */ X86_EFL_PF,
252 /* 0x3d = 00111101b */ 0,
253 /* 0x3e = 00111110b */ 0,
254 /* 0x3f = 00111111b */ X86_EFL_PF,
255 /* 0x40 = 01000000b */ 0,
256 /* 0x41 = 01000001b */ X86_EFL_PF,
257 /* 0x42 = 01000010b */ X86_EFL_PF,
258 /* 0x43 = 01000011b */ 0,
259 /* 0x44 = 01000100b */ X86_EFL_PF,
260 /* 0x45 = 01000101b */ 0,
261 /* 0x46 = 01000110b */ 0,
262 /* 0x47 = 01000111b */ X86_EFL_PF,
263 /* 0x48 = 01001000b */ X86_EFL_PF,
264 /* 0x49 = 01001001b */ 0,
265 /* 0x4a = 01001010b */ 0,
266 /* 0x4b = 01001011b */ X86_EFL_PF,
267 /* 0x4c = 01001100b */ 0,
268 /* 0x4d = 01001101b */ X86_EFL_PF,
269 /* 0x4e = 01001110b */ X86_EFL_PF,
270 /* 0x4f = 01001111b */ 0,
271 /* 0x50 = 01010000b */ X86_EFL_PF,
272 /* 0x51 = 01010001b */ 0,
273 /* 0x52 = 01010010b */ 0,
274 /* 0x53 = 01010011b */ X86_EFL_PF,
275 /* 0x54 = 01010100b */ 0,
276 /* 0x55 = 01010101b */ X86_EFL_PF,
277 /* 0x56 = 01010110b */ X86_EFL_PF,
278 /* 0x57 = 01010111b */ 0,
279 /* 0x58 = 01011000b */ 0,
280 /* 0x59 = 01011001b */ X86_EFL_PF,
281 /* 0x5a = 01011010b */ X86_EFL_PF,
282 /* 0x5b = 01011011b */ 0,
283 /* 0x5c = 01011100b */ X86_EFL_PF,
284 /* 0x5d = 01011101b */ 0,
285 /* 0x5e = 01011110b */ 0,
286 /* 0x5f = 01011111b */ X86_EFL_PF,
287 /* 0x60 = 01100000b */ X86_EFL_PF,
288 /* 0x61 = 01100001b */ 0,
289 /* 0x62 = 01100010b */ 0,
290 /* 0x63 = 01100011b */ X86_EFL_PF,
291 /* 0x64 = 01100100b */ 0,
292 /* 0x65 = 01100101b */ X86_EFL_PF,
293 /* 0x66 = 01100110b */ X86_EFL_PF,
294 /* 0x67 = 01100111b */ 0,
295 /* 0x68 = 01101000b */ 0,
296 /* 0x69 = 01101001b */ X86_EFL_PF,
297 /* 0x6a = 01101010b */ X86_EFL_PF,
298 /* 0x6b = 01101011b */ 0,
299 /* 0x6c = 01101100b */ X86_EFL_PF,
300 /* 0x6d = 01101101b */ 0,
301 /* 0x6e = 01101110b */ 0,
302 /* 0x6f = 01101111b */ X86_EFL_PF,
303 /* 0x70 = 01110000b */ 0,
304 /* 0x71 = 01110001b */ X86_EFL_PF,
305 /* 0x72 = 01110010b */ X86_EFL_PF,
306 /* 0x73 = 01110011b */ 0,
307 /* 0x74 = 01110100b */ X86_EFL_PF,
308 /* 0x75 = 01110101b */ 0,
309 /* 0x76 = 01110110b */ 0,
310 /* 0x77 = 01110111b */ X86_EFL_PF,
311 /* 0x78 = 01111000b */ X86_EFL_PF,
312 /* 0x79 = 01111001b */ 0,
313 /* 0x7a = 01111010b */ 0,
314 /* 0x7b = 01111011b */ X86_EFL_PF,
315 /* 0x7c = 01111100b */ 0,
316 /* 0x7d = 01111101b */ X86_EFL_PF,
317 /* 0x7e = 01111110b */ X86_EFL_PF,
318 /* 0x7f = 01111111b */ 0,
319 /* 0x80 = 10000000b */ 0,
320 /* 0x81 = 10000001b */ X86_EFL_PF,
321 /* 0x82 = 10000010b */ X86_EFL_PF,
322 /* 0x83 = 10000011b */ 0,
323 /* 0x84 = 10000100b */ X86_EFL_PF,
324 /* 0x85 = 10000101b */ 0,
325 /* 0x86 = 10000110b */ 0,
326 /* 0x87 = 10000111b */ X86_EFL_PF,
327 /* 0x88 = 10001000b */ X86_EFL_PF,
328 /* 0x89 = 10001001b */ 0,
329 /* 0x8a = 10001010b */ 0,
330 /* 0x8b = 10001011b */ X86_EFL_PF,
331 /* 0x8c = 10001100b */ 0,
332 /* 0x8d = 10001101b */ X86_EFL_PF,
333 /* 0x8e = 10001110b */ X86_EFL_PF,
334 /* 0x8f = 10001111b */ 0,
335 /* 0x90 = 10010000b */ X86_EFL_PF,
336 /* 0x91 = 10010001b */ 0,
337 /* 0x92 = 10010010b */ 0,
338 /* 0x93 = 10010011b */ X86_EFL_PF,
339 /* 0x94 = 10010100b */ 0,
340 /* 0x95 = 10010101b */ X86_EFL_PF,
341 /* 0x96 = 10010110b */ X86_EFL_PF,
342 /* 0x97 = 10010111b */ 0,
343 /* 0x98 = 10011000b */ 0,
344 /* 0x99 = 10011001b */ X86_EFL_PF,
345 /* 0x9a = 10011010b */ X86_EFL_PF,
346 /* 0x9b = 10011011b */ 0,
347 /* 0x9c = 10011100b */ X86_EFL_PF,
348 /* 0x9d = 10011101b */ 0,
349 /* 0x9e = 10011110b */ 0,
350 /* 0x9f = 10011111b */ X86_EFL_PF,
351 /* 0xa0 = 10100000b */ X86_EFL_PF,
352 /* 0xa1 = 10100001b */ 0,
353 /* 0xa2 = 10100010b */ 0,
354 /* 0xa3 = 10100011b */ X86_EFL_PF,
355 /* 0xa4 = 10100100b */ 0,
356 /* 0xa5 = 10100101b */ X86_EFL_PF,
357 /* 0xa6 = 10100110b */ X86_EFL_PF,
358 /* 0xa7 = 10100111b */ 0,
359 /* 0xa8 = 10101000b */ 0,
360 /* 0xa9 = 10101001b */ X86_EFL_PF,
361 /* 0xaa = 10101010b */ X86_EFL_PF,
362 /* 0xab = 10101011b */ 0,
363 /* 0xac = 10101100b */ X86_EFL_PF,
364 /* 0xad = 10101101b */ 0,
365 /* 0xae = 10101110b */ 0,
366 /* 0xaf = 10101111b */ X86_EFL_PF,
367 /* 0xb0 = 10110000b */ 0,
368 /* 0xb1 = 10110001b */ X86_EFL_PF,
369 /* 0xb2 = 10110010b */ X86_EFL_PF,
370 /* 0xb3 = 10110011b */ 0,
371 /* 0xb4 = 10110100b */ X86_EFL_PF,
372 /* 0xb5 = 10110101b */ 0,
373 /* 0xb6 = 10110110b */ 0,
374 /* 0xb7 = 10110111b */ X86_EFL_PF,
375 /* 0xb8 = 10111000b */ X86_EFL_PF,
376 /* 0xb9 = 10111001b */ 0,
377 /* 0xba = 10111010b */ 0,
378 /* 0xbb = 10111011b */ X86_EFL_PF,
379 /* 0xbc = 10111100b */ 0,
380 /* 0xbd = 10111101b */ X86_EFL_PF,
381 /* 0xbe = 10111110b */ X86_EFL_PF,
382 /* 0xbf = 10111111b */ 0,
383 /* 0xc0 = 11000000b */ X86_EFL_PF,
384 /* 0xc1 = 11000001b */ 0,
385 /* 0xc2 = 11000010b */ 0,
386 /* 0xc3 = 11000011b */ X86_EFL_PF,
387 /* 0xc4 = 11000100b */ 0,
388 /* 0xc5 = 11000101b */ X86_EFL_PF,
389 /* 0xc6 = 11000110b */ X86_EFL_PF,
390 /* 0xc7 = 11000111b */ 0,
391 /* 0xc8 = 11001000b */ 0,
392 /* 0xc9 = 11001001b */ X86_EFL_PF,
393 /* 0xca = 11001010b */ X86_EFL_PF,
394 /* 0xcb = 11001011b */ 0,
395 /* 0xcc = 11001100b */ X86_EFL_PF,
396 /* 0xcd = 11001101b */ 0,
397 /* 0xce = 11001110b */ 0,
398 /* 0xcf = 11001111b */ X86_EFL_PF,
399 /* 0xd0 = 11010000b */ 0,
400 /* 0xd1 = 11010001b */ X86_EFL_PF,
401 /* 0xd2 = 11010010b */ X86_EFL_PF,
402 /* 0xd3 = 11010011b */ 0,
403 /* 0xd4 = 11010100b */ X86_EFL_PF,
404 /* 0xd5 = 11010101b */ 0,
405 /* 0xd6 = 11010110b */ 0,
406 /* 0xd7 = 11010111b */ X86_EFL_PF,
407 /* 0xd8 = 11011000b */ X86_EFL_PF,
408 /* 0xd9 = 11011001b */ 0,
409 /* 0xda = 11011010b */ 0,
410 /* 0xdb = 11011011b */ X86_EFL_PF,
411 /* 0xdc = 11011100b */ 0,
412 /* 0xdd = 11011101b */ X86_EFL_PF,
413 /* 0xde = 11011110b */ X86_EFL_PF,
414 /* 0xdf = 11011111b */ 0,
415 /* 0xe0 = 11100000b */ 0,
416 /* 0xe1 = 11100001b */ X86_EFL_PF,
417 /* 0xe2 = 11100010b */ X86_EFL_PF,
418 /* 0xe3 = 11100011b */ 0,
419 /* 0xe4 = 11100100b */ X86_EFL_PF,
420 /* 0xe5 = 11100101b */ 0,
421 /* 0xe6 = 11100110b */ 0,
422 /* 0xe7 = 11100111b */ X86_EFL_PF,
423 /* 0xe8 = 11101000b */ X86_EFL_PF,
424 /* 0xe9 = 11101001b */ 0,
425 /* 0xea = 11101010b */ 0,
426 /* 0xeb = 11101011b */ X86_EFL_PF,
427 /* 0xec = 11101100b */ 0,
428 /* 0xed = 11101101b */ X86_EFL_PF,
429 /* 0xee = 11101110b */ X86_EFL_PF,
430 /* 0xef = 11101111b */ 0,
431 /* 0xf0 = 11110000b */ X86_EFL_PF,
432 /* 0xf1 = 11110001b */ 0,
433 /* 0xf2 = 11110010b */ 0,
434 /* 0xf3 = 11110011b */ X86_EFL_PF,
435 /* 0xf4 = 11110100b */ 0,
436 /* 0xf5 = 11110101b */ X86_EFL_PF,
437 /* 0xf6 = 11110110b */ X86_EFL_PF,
438 /* 0xf7 = 11110111b */ 0,
439 /* 0xf8 = 11111000b */ 0,
440 /* 0xf9 = 11111001b */ X86_EFL_PF,
441 /* 0xfa = 11111010b */ X86_EFL_PF,
442 /* 0xfb = 11111011b */ 0,
443 /* 0xfc = 11111100b */ X86_EFL_PF,
444 /* 0xfd = 11111101b */ 0,
445 /* 0xfe = 11111110b */ 0,
446 /* 0xff = 11111111b */ X86_EFL_PF,
447};
448
449/* for clang: */
450extern const RTFLOAT80U g_ar80Zero[];
451extern const RTFLOAT80U g_ar80One[];
452extern const RTFLOAT80U g_r80Indefinite;
453extern const RTFLOAT80U g_ar80Infinity[];
454extern const RTFLOAT128U g_r128Ln2;
455extern const RTUINT128U g_u128Ln2Mantissa;
456extern const RTUINT128U g_u128Ln2MantissaIntel;
457extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
458
459/** Zero values (indexed by fSign). */
460RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
461
462/** One values (indexed by fSign). */
463RTFLOAT80U const g_ar80One[] =
464{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
465
466/** Indefinite (negative). */
467RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
468
469/** Infinities (indexed by fSign). */
470RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
471
#if 0 /* Currently disabled; nothing in the visible part of this file refers to it. */
/** 128-bit floating point constant: 2.0 (zero fraction, exponent bias + 1). */
const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
#endif
476
477
478/* The next section is generated by tools/IEMGenFpuConstants: */
479
480/** The ln2 constant as 128-bit floating point value.
481 * base-10: 6.93147180559945309417232121458176575e-1
482 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
483 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
484//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
485const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
486/** High precision ln2 value.
487 * base-10: 6.931471805599453094172321214581765680747e-1
488 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
489 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
490const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
491/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
492 * base-10: 6.931471805599453094151379470289064954613e-1
493 * base-16: b.17217f7d1cf79abc0000000000000000@-1
494 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
495const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
496
497/** Horner constants for f2xm1 */
498const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
499{
500 /* a0
501 * base-10: 1.00000000000000000000000000000000000e0
502 * base-16: 1.0000000000000000000000000000@0
503 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
504 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
505 /* a1
506 * base-10: 5.00000000000000000000000000000000000e-1
507 * base-16: 8.0000000000000000000000000000@-1
508 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
509 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
510 /* a2
511 * base-10: 1.66666666666666666666666666666666658e-1
512 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
513 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
514 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
515 /* a3
516 * base-10: 4.16666666666666666666666666666666646e-2
517 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
518 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
519 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
520 /* a4
521 * base-10: 8.33333333333333333333333333333333323e-3
522 * base-16: 2.2222222222222222222222222222@-2
523 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
524 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
525 /* a5
526 * base-10: 1.38888888888888888888888888888888874e-3
527 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
528 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
529 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
530 /* a6
531 * base-10: 1.98412698412698412698412698412698412e-4
532 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
533 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
534 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
535 /* a7
536 * base-10: 2.48015873015873015873015873015873015e-5
537 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
538 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
539 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
540 /* a8
541 * base-10: 2.75573192239858906525573192239858902e-6
542 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
543 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
544 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
545 /* a9
546 * base-10: 2.75573192239858906525573192239858865e-7
547 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
548 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
549 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
550 /* a10
551 * base-10: 2.50521083854417187750521083854417184e-8
552 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
553 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
554 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
555 /* a11
556 * base-10: 2.08767569878680989792100903212014296e-9
557 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
558 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
559 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
560 /* a12
561 * base-10: 1.60590438368216145993923771701549472e-10
562 * base-16: b.092309d43684be51c198e91d7b40@-9
563 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
564 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
565 /* a13
566 * base-10: 1.14707455977297247138516979786821043e-11
567 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
568 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
569 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
570 /* a14
571 * base-10: 7.64716373181981647590113198578806964e-13
572 * base-16: d.73f9f399dc0f88ec32b587746578@-11
573 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
574 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
575 /* a15
576 * base-10: 4.77947733238738529743820749111754352e-14
577 * base-16: d.73f9f399dc0f88ec32b587746578@-12
578 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
579 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
580 /* a16
581 * base-10: 2.81145725434552076319894558301031970e-15
582 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
583 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
584 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
585 /* a17
586 * base-10: 1.56192069685862264622163643500573321e-16
587 * base-16: b.413c31dcbecbbdd8024435161550@-14
588 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
589 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
590 /* a18
591 * base-10: 8.22063524662432971695598123687227980e-18
592 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
593 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
594 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
595 /* a19
596 * base-10: 4.11031762331216485847799061843614006e-19
597 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
598 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
599 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
600 /* a20
601 * base-10: 7.04351638180413298434020229233492164e-20
602 * base-16: 1.4c9ee35db1d1f3c946fdcd48fd88@-16
603 * base-2 : 1.0100110010011110111000110101110110110001110100011111001111001001010001101111110111001101010010001111110110001000e-64 */
604 RTFLOAT128U_INIT_C(0, 0x4c9ee35db1d1, 0xf3c946fdcd48fd88, 0x3fbf),
605 /* a21
606 * base-10: 5.81527769640186708776361513365257702e-20
607 * base-16: 1.129e64bff606a2b9c9fc624481cd@-16
608 * base-2 : 1.0001001010011110011001001011111111110110000001101010001010111001110010011111110001100010010001001000000111001101e-64 */
609 RTFLOAT128U_INIT_C(0, 0x129e64bff606, 0xa2b9c9fc624481cd, 0x3fbf),
610};
611
612
/*
 * There are a few 64-bit on 32-bit things we'd rather do in C.  Actually, doing
 * it all in C is probably safer at the moment; what proves necessary can be
 * optimized later.
 */
617#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
618
619
620/*********************************************************************************************************************************
621* Binary Operations *
622*********************************************************************************************************************************/
623
624/*
625 * ADD
626 */
627
628IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
629{
630 uint64_t uDst = *puDst;
631 uint64_t uResult = uDst + uSrc;
632 *puDst = uResult;
633 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
634}
635
636# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
637
638IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
639{
640 uint32_t uDst = *puDst;
641 uint32_t uResult = uDst + uSrc;
642 *puDst = uResult;
643 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
644}
645
646
647IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
648{
649 uint16_t uDst = *puDst;
650 uint16_t uResult = uDst + uSrc;
651 *puDst = uResult;
652 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
653}
654
655
656IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
657{
658 uint8_t uDst = *puDst;
659 uint8_t uResult = uDst + uSrc;
660 *puDst = uResult;
661 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
662}
663
664# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
665
666/*
667 * ADC
668 */
669
670IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
671{
672 if (!(*pfEFlags & X86_EFL_CF))
673 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
674 else
675 {
676 uint64_t uDst = *puDst;
677 uint64_t uResult = uDst + uSrc + 1;
678 *puDst = uResult;
679 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
680 }
681}
682
683# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
684
685IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
686{
687 if (!(*pfEFlags & X86_EFL_CF))
688 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
689 else
690 {
691 uint32_t uDst = *puDst;
692 uint32_t uResult = uDst + uSrc + 1;
693 *puDst = uResult;
694 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
695 }
696}
697
698
699IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
700{
701 if (!(*pfEFlags & X86_EFL_CF))
702 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
703 else
704 {
705 uint16_t uDst = *puDst;
706 uint16_t uResult = uDst + uSrc + 1;
707 *puDst = uResult;
708 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
709 }
710}
711
712
713IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
714{
715 if (!(*pfEFlags & X86_EFL_CF))
716 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
717 else
718 {
719 uint8_t uDst = *puDst;
720 uint8_t uResult = uDst + uSrc + 1;
721 *puDst = uResult;
722 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
723 }
724}
725
726# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
727
728/*
729 * SUB
730 */
731
732IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
733{
734 uint64_t uDst = *puDst;
735 uint64_t uResult = uDst - uSrc;
736 *puDst = uResult;
737 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
738}
739
740# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
741
742IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
743{
744 uint32_t uDst = *puDst;
745 uint32_t uResult = uDst - uSrc;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
748}
749
750
751IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
752{
753 uint16_t uDst = *puDst;
754 uint16_t uResult = uDst - uSrc;
755 *puDst = uResult;
756 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
757}
758
759
760IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
761{
762 uint8_t uDst = *puDst;
763 uint8_t uResult = uDst - uSrc;
764 *puDst = uResult;
765 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
766}
767
768# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
769
770/*
771 * SBB
772 */
773
774IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
775{
776 if (!(*pfEFlags & X86_EFL_CF))
777 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
778 else
779 {
780 uint64_t uDst = *puDst;
781 uint64_t uResult = uDst - uSrc - 1;
782 *puDst = uResult;
783 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
784 }
785}
786
787# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
788
789IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
790{
791 if (!(*pfEFlags & X86_EFL_CF))
792 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
793 else
794 {
795 uint32_t uDst = *puDst;
796 uint32_t uResult = uDst - uSrc - 1;
797 *puDst = uResult;
798 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
799 }
800}
801
802
803IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
804{
805 if (!(*pfEFlags & X86_EFL_CF))
806 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
807 else
808 {
809 uint16_t uDst = *puDst;
810 uint16_t uResult = uDst - uSrc - 1;
811 *puDst = uResult;
812 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
813 }
814}
815
816
817IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
818{
819 if (!(*pfEFlags & X86_EFL_CF))
820 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
821 else
822 {
823 uint8_t uDst = *puDst;
824 uint8_t uResult = uDst - uSrc - 1;
825 *puDst = uResult;
826 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
827 }
828}
829
830# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
831
832
833/*
834 * OR
835 */
836
837IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
838{
839 uint64_t uResult = *puDst | uSrc;
840 *puDst = uResult;
841 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
842}
843
844# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
845
846IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
847{
848 uint32_t uResult = *puDst | uSrc;
849 *puDst = uResult;
850 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
851}
852
853
854IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
855{
856 uint16_t uResult = *puDst | uSrc;
857 *puDst = uResult;
858 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
859}
860
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
863{
864 uint8_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
867}
868
869# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
870
871/*
872 * XOR
873 */
874
875IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
876{
877 uint64_t uResult = *puDst ^ uSrc;
878 *puDst = uResult;
879 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
880}
881
882# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
883
884IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
885{
886 uint32_t uResult = *puDst ^ uSrc;
887 *puDst = uResult;
888 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
889}
890
891
892IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
893{
894 uint16_t uResult = *puDst ^ uSrc;
895 *puDst = uResult;
896 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
897}
898
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
901{
902 uint8_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
905}
906
907# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
908
909/*
910 * AND
911 */
912
913IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
914{
915 uint64_t const uResult = *puDst & uSrc;
916 *puDst = uResult;
917 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
918}
919
920# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
921
922IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
923{
924 uint32_t const uResult = *puDst & uSrc;
925 *puDst = uResult;
926 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
927}
928
929
930IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
931{
932 uint16_t const uResult = *puDst & uSrc;
933 *puDst = uResult;
934 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
935}
936
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
939{
940 uint8_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
943}
944
945# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
946#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
947
948/*
949 * ANDN (BMI1 instruction)
950 */
951
952IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
953{
954 uint64_t const uResult = ~uSrc1 & uSrc2;
955 *puDst = uResult;
956 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
957}
958
959
960IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
961{
962 uint32_t const uResult = ~uSrc1 & uSrc2;
963 *puDst = uResult;
964 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
965}
966
967
968#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
969IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
970{
971 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
972}
973#endif
974
975
976#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
977IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
978{
979 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
980}
981#endif
982
983#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
984
985/*
986 * CMP
987 */
988
989IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
990{
991 uint64_t uDstTmp = *puDst;
992 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
993}
994
995# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
996
997IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
998{
999 uint32_t uDstTmp = *puDst;
1000 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1001}
1002
1003
1004IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1005{
1006 uint16_t uDstTmp = *puDst;
1007 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1008}
1009
1010
1011IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1012{
1013 uint8_t uDstTmp = *puDst;
1014 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1015}
1016
1017# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1018
1019/*
1020 * TEST
1021 */
1022
1023IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1024{
1025 uint64_t uResult = *puDst & uSrc;
1026 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1027}
1028
1029# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1030
1031IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1032{
1033 uint32_t uResult = *puDst & uSrc;
1034 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1035}
1036
1037
1038IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1039{
1040 uint16_t uResult = *puDst & uSrc;
1041 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1042}
1043
1044
1045IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1046{
1047 uint8_t uResult = *puDst & uSrc;
1048 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1049}
1050
1051# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1052
1053
1054/*
1055 * LOCK prefixed variants of the above
1056 */
1057
/** Locked binary operand operation, generic over the operand width.
 *
 * Must be expanded where puDst, uSrc and pfEFlags are in scope.  The
 * non-locked worker iemAImpl_<mnemonic>_u<width> is run on private copies of
 * the destination value and of the flags; the attempt is repeated until the
 * compare-exchange on *puDst reports success, and only then are the computed
 * flags stored to *pfEFlags.
 *
 * NOTE(review): relies on ASMAtomicCmpXchgExU<width> passing the current
 * memory value back through its last argument on failure - confirm against
 * iprt/asm.h.
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uExpected = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uNew; \
        uint32_t                   fEflNew; \
        for (;;) \
        { \
            uNew    = uExpected; \
            fEflNew = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNew, uSrc, &fEflNew); \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uExpected, &uExpected)) \
                break; \
        } \
        *pfEFlags = fEflNew; \
    } while (0)
1072
1073
1074#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1075 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
1076 uint ## a_cBitsWidth ## _t uSrc, \
1077 uint32_t *pfEFlags)) \
1078 { \
1079 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1080 }
1081
1082EMIT_LOCKED_BIN_OP(add, 64)
1083EMIT_LOCKED_BIN_OP(adc, 64)
1084EMIT_LOCKED_BIN_OP(sub, 64)
1085EMIT_LOCKED_BIN_OP(sbb, 64)
1086EMIT_LOCKED_BIN_OP(or, 64)
1087EMIT_LOCKED_BIN_OP(xor, 64)
1088EMIT_LOCKED_BIN_OP(and, 64)
1089# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1090EMIT_LOCKED_BIN_OP(add, 32)
1091EMIT_LOCKED_BIN_OP(adc, 32)
1092EMIT_LOCKED_BIN_OP(sub, 32)
1093EMIT_LOCKED_BIN_OP(sbb, 32)
1094EMIT_LOCKED_BIN_OP(or, 32)
1095EMIT_LOCKED_BIN_OP(xor, 32)
1096EMIT_LOCKED_BIN_OP(and, 32)
1097
1098EMIT_LOCKED_BIN_OP(add, 16)
1099EMIT_LOCKED_BIN_OP(adc, 16)
1100EMIT_LOCKED_BIN_OP(sub, 16)
1101EMIT_LOCKED_BIN_OP(sbb, 16)
1102EMIT_LOCKED_BIN_OP(or, 16)
1103EMIT_LOCKED_BIN_OP(xor, 16)
1104EMIT_LOCKED_BIN_OP(and, 16)
1105
1106EMIT_LOCKED_BIN_OP(add, 8)
1107EMIT_LOCKED_BIN_OP(adc, 8)
1108EMIT_LOCKED_BIN_OP(sub, 8)
1109EMIT_LOCKED_BIN_OP(sbb, 8)
1110EMIT_LOCKED_BIN_OP(or, 8)
1111EMIT_LOCKED_BIN_OP(xor, 8)
1112EMIT_LOCKED_BIN_OP(and, 8)
1113# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1114
1115
1116/*
1117 * Bit operations (same signature as above).
1118 */
1119
1120/*
1121 * BT
1122 */
1123
1124IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1125{
1126 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1127 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1128 Assert(uSrc < 64);
1129 uint64_t uDst = *puDst;
1130 if (uDst & RT_BIT_64(uSrc))
1131 *pfEFlags |= X86_EFL_CF;
1132 else
1133 *pfEFlags &= ~X86_EFL_CF;
1134}
1135
1136# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1137
1138IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1139{
1140 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1141 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1142 Assert(uSrc < 32);
1143 uint32_t uDst = *puDst;
1144 if (uDst & RT_BIT_32(uSrc))
1145 *pfEFlags |= X86_EFL_CF;
1146 else
1147 *pfEFlags &= ~X86_EFL_CF;
1148}
1149
1150IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1151{
1152 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1153 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1154 Assert(uSrc < 16);
1155 uint16_t uDst = *puDst;
1156 if (uDst & RT_BIT_32(uSrc))
1157 *pfEFlags |= X86_EFL_CF;
1158 else
1159 *pfEFlags &= ~X86_EFL_CF;
1160}
1161
1162# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1163
1164/*
1165 * BTC
1166 */
1167
1168IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1169{
1170 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1171 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1172 Assert(uSrc < 64);
1173 uint64_t fMask = RT_BIT_64(uSrc);
1174 uint64_t uDst = *puDst;
1175 if (uDst & fMask)
1176 {
1177 uDst &= ~fMask;
1178 *puDst = uDst;
1179 *pfEFlags |= X86_EFL_CF;
1180 }
1181 else
1182 {
1183 uDst |= fMask;
1184 *puDst = uDst;
1185 *pfEFlags &= ~X86_EFL_CF;
1186 }
1187}
1188
1189# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1190
1191IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1192{
1193 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1194 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1195 Assert(uSrc < 32);
1196 uint32_t fMask = RT_BIT_32(uSrc);
1197 uint32_t uDst = *puDst;
1198 if (uDst & fMask)
1199 {
1200 uDst &= ~fMask;
1201 *puDst = uDst;
1202 *pfEFlags |= X86_EFL_CF;
1203 }
1204 else
1205 {
1206 uDst |= fMask;
1207 *puDst = uDst;
1208 *pfEFlags &= ~X86_EFL_CF;
1209 }
1210}
1211
1212
1213IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1214{
1215 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1216 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1217 Assert(uSrc < 16);
1218 uint16_t fMask = RT_BIT_32(uSrc);
1219 uint16_t uDst = *puDst;
1220 if (uDst & fMask)
1221 {
1222 uDst &= ~fMask;
1223 *puDst = uDst;
1224 *pfEFlags |= X86_EFL_CF;
1225 }
1226 else
1227 {
1228 uDst |= fMask;
1229 *puDst = uDst;
1230 *pfEFlags &= ~X86_EFL_CF;
1231 }
1232}
1233
1234# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1235
1236/*
1237 * BTR
1238 */
1239
1240IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1241{
1242 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1243 logical operation (AND/OR/whatever). */
1244 Assert(uSrc < 64);
1245 uint64_t fMask = RT_BIT_64(uSrc);
1246 uint64_t uDst = *puDst;
1247 if (uDst & fMask)
1248 {
1249 uDst &= ~fMask;
1250 *puDst = uDst;
1251 *pfEFlags |= X86_EFL_CF;
1252 }
1253 else
1254 *pfEFlags &= ~X86_EFL_CF;
1255}
1256
1257# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1258
1259IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1260{
1261 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1262 logical operation (AND/OR/whatever). */
1263 Assert(uSrc < 32);
1264 uint32_t fMask = RT_BIT_32(uSrc);
1265 uint32_t uDst = *puDst;
1266 if (uDst & fMask)
1267 {
1268 uDst &= ~fMask;
1269 *puDst = uDst;
1270 *pfEFlags |= X86_EFL_CF;
1271 }
1272 else
1273 *pfEFlags &= ~X86_EFL_CF;
1274}
1275
1276
1277IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1278{
1279 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1280 logical operation (AND/OR/whatever). */
1281 Assert(uSrc < 16);
1282 uint16_t fMask = RT_BIT_32(uSrc);
1283 uint16_t uDst = *puDst;
1284 if (uDst & fMask)
1285 {
1286 uDst &= ~fMask;
1287 *puDst = uDst;
1288 *pfEFlags |= X86_EFL_CF;
1289 }
1290 else
1291 *pfEFlags &= ~X86_EFL_CF;
1292}
1293
1294# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1295
1296/*
1297 * BTS
1298 */
1299
1300IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1301{
1302 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1303 logical operation (AND/OR/whatever). */
1304 Assert(uSrc < 64);
1305 uint64_t fMask = RT_BIT_64(uSrc);
1306 uint64_t uDst = *puDst;
1307 if (uDst & fMask)
1308 *pfEFlags |= X86_EFL_CF;
1309 else
1310 {
1311 uDst |= fMask;
1312 *puDst = uDst;
1313 *pfEFlags &= ~X86_EFL_CF;
1314 }
1315}
1316
1317# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1318
1319IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1320{
1321 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1322 logical operation (AND/OR/whatever). */
1323 Assert(uSrc < 32);
1324 uint32_t fMask = RT_BIT_32(uSrc);
1325 uint32_t uDst = *puDst;
1326 if (uDst & fMask)
1327 *pfEFlags |= X86_EFL_CF;
1328 else
1329 {
1330 uDst |= fMask;
1331 *puDst = uDst;
1332 *pfEFlags &= ~X86_EFL_CF;
1333 }
1334}
1335
1336
1337IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1338{
1339 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1340 logical operation (AND/OR/whatever). */
1341 Assert(uSrc < 16);
1342 uint16_t fMask = RT_BIT_32(uSrc);
1343 uint32_t uDst = *puDst;
1344 if (uDst & fMask)
1345 *pfEFlags |= X86_EFL_CF;
1346 else
1347 {
1348 uDst |= fMask;
1349 *puDst = uDst;
1350 *pfEFlags &= ~X86_EFL_CF;
1351 }
1352}
1353
1354# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1355
1356
1357EMIT_LOCKED_BIN_OP(btc, 64)
1358EMIT_LOCKED_BIN_OP(btr, 64)
1359EMIT_LOCKED_BIN_OP(bts, 64)
1360# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1361EMIT_LOCKED_BIN_OP(btc, 32)
1362EMIT_LOCKED_BIN_OP(btr, 32)
1363EMIT_LOCKED_BIN_OP(bts, 32)
1364
1365EMIT_LOCKED_BIN_OP(btc, 16)
1366EMIT_LOCKED_BIN_OP(btr, 16)
1367EMIT_LOCKED_BIN_OP(bts, 16)
1368# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1369
1370
1371/*
1372 * Helpers for BSR and BSF.
1373 *
1374 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1375 * Intel behavior modelled on 10980xe, AMD on 3990X. Other marchs may
1376 * produce different result (see https://www.sandpile.org/x86/flags.htm),
1377 * but we restrict ourselves to emulating these recent marchs.
1378 */
/* Intel flavour of the BSF/BSR result handling.
 *   a_puDst    - where to store the zero-based bit index (only written when a bit was found).
 *   a_pfEFlags - pointer to the EFLAGS value to update.
 *   a_iBit     - one-based index of the found bit, 0 if no bit was set.
 * All six status flags are cleared first; then either the parity table entry
 * for the zero-based index is merged in (bit found) or ZF and PF are set
 * (nothing found). */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/* AMD flavour of the BSF/BSR result handling.
 *   a_puDst    - where to store the zero-based bit index (only written when a bit was found).
 *   a_pfEFlags - pointer to the EFLAGS value to update.
 *   a_iBit     - one-based index of the found bit, 0 if no bit was set.
 * Only ZF is changed: cleared when a bit was found, set otherwise. */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst) = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1401
1402
1403/*
1404 * BSF - first (least significant) bit set
1405 */
1406IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1407{
1408 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1409}
1410
1411IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1412{
1413 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1414}
1415
1416IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1417{
1418 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1419}
1420
1421# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1422
1423IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1424{
1425 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1426}
1427
1428IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1429{
1430 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1431}
1432
1433IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1434{
1435 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1436}
1437
1438
1439IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1440{
1441 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1442}
1443
1444IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1445{
1446 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1447}
1448
1449IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1450{
1451 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1452}
1453
1454# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1455
1456
1457/*
1458 * BSR - last (most significant) bit set
1459 */
1460IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1461{
1462 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1463}
1464
1465IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1466{
1467 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1468}
1469
1470IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1471{
1472 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1473}
1474
1475# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1476
1477IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1478{
1479 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1480}
1481
1482IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1483{
1484 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1485}
1486
1487IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1488{
1489 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1490}
1491
1492
1493IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1494{
1495 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1496}
1497
1498IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1499{
1500 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1501}
1502
1503IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1504{
1505 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1506}
1507
1508# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1509
1510
1511/*
1512 * Helpers for LZCNT and TZCNT.
1513 */
/* Intel flavour of the LZCNT/TZCNT result handling.
 *   a_puDst    - where to store the count (always written).
 *   a_uSrc     - the source operand; CF is set when it is zero.
 *   a_pfEFlags - pointer to the EFLAGS value to update.
 *   a_uResult  - the zero-bit count.
 * All six status flags are cleared first; a non-zero count merges in its
 * parity table entry, a zero count sets ZF and PF. */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/* AMD flavour of the LZCNT/TZCNT result handling.
 *   a_puDst    - where to store the count (always written).
 *   a_uSrc     - the source operand; CF is set when it is zero.
 *   a_pfEFlags - pointer to the EFLAGS value to update.
 *   a_uResult  - the zero-bit count; ZF is set when it is zero.
 * Only ZF and CF are recalculated, the other flags are left alone. */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1536
1537
1538/*
1539 * LZCNT - count leading zero bits.
1540 */
1541IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1542{
1543 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1544}
1545
1546IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1547{
1548 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1549}
1550
1551IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1552{
1553 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1554}
1555
1556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1557
1558IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1559{
1560 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1561}
1562
1563IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1564{
1565 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1566}
1567
1568IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1569{
1570 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1571}
1572
1573
1574IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1575{
1576 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1577}
1578
1579IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1580{
1581 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1582}
1583
1584IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1585{
1586 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1587}
1588
1589# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1590
1591
/*
 * TZCNT - count trailing zero bits.
 */
1595IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1596{
1597 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1598}
1599
1600IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1601{
1602 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1603}
1604
1605IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1606{
1607 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1608}
1609
1610# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1611
1612IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1613{
1614 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1615}
1616
1617IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1618{
1619 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1620}
1621
1622IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1623{
1624 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1625}
1626
1627
1628IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1629{
1630 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1631}
1632
1633IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1634{
1635 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1636}
1637
1638IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1639{
1640 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1641}
1642
1643# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1644#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1645
1646/*
1647 * BEXTR (BMI1 instruction)
1648 */
1649#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1650IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1651 a_Type uSrc2, uint32_t *pfEFlags)) \
1652{ \
1653 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1654 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1655 a_Type uResult; \
1656 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1657 if (iFirstBit < a_cBits) \
1658 { \
1659 uResult = uSrc1 >> iFirstBit; \
1660 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1661 if (cBits < a_cBits) \
1662 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1663 *puDst = uResult; \
1664 if (!uResult) \
1665 fEfl |= X86_EFL_ZF; \
1666 } \
1667 else \
1668 { \
1669 *puDst = uResult = 0; \
1670 fEfl |= X86_EFL_ZF; \
1671 } \
1672 /** @todo complete flag calculations. */ \
1673 *pfEFlags = fEfl; \
1674}
1675
1676EMIT_BEXTR(64, uint64_t, _fallback)
1677EMIT_BEXTR(32, uint32_t, _fallback)
1678#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1679EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1680#endif
1681#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1682EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1683#endif
1684
1685/*
1686 * BLSR (BMI1 instruction)
1687 */
1688#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1689IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1690{ \
1691 uint32_t fEfl1 = *pfEFlags; \
1692 uint32_t fEfl2 = fEfl1; \
1693 *puDst = uSrc; \
1694 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1695 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1696 \
1697 /* AMD: The carry flag is from the SUB operation. */ \
1698 /* 10890xe: PF always cleared? */ \
1699 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1700 fEfl2 |= fEfl1 & X86_EFL_CF; \
1701 *pfEFlags = fEfl2; \
1702}
1703
1704EMIT_BLSR(64, uint64_t, _fallback)
1705EMIT_BLSR(32, uint32_t, _fallback)
1706#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BLSR(64, uint64_t, RT_NOTHING)
1708#endif
1709#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1710EMIT_BLSR(32, uint32_t, RT_NOTHING)
1711#endif
1712
1713/*
1714 * BLSMSK (BMI1 instruction)
1715 */
1716#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1717IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1718{ \
1719 uint32_t fEfl1 = *pfEFlags; \
1720 uint32_t fEfl2 = fEfl1; \
1721 *puDst = uSrc; \
1722 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1723 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1724 \
1725 /* AMD: The carry flag is from the SUB operation. */ \
1726 /* 10890xe: PF always cleared? */ \
1727 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1728 fEfl2 |= fEfl1 & X86_EFL_CF; \
1729 *pfEFlags = fEfl2; \
1730}
1731
1732EMIT_BLSMSK(64, uint64_t, _fallback)
1733EMIT_BLSMSK(32, uint32_t, _fallback)
1734#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1736#endif
1737#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1738EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1739#endif
1740
1741/*
1742 * BLSI (BMI1 instruction)
1743 */
1744#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1745IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1746{ \
1747 uint32_t fEfl1 = *pfEFlags; \
1748 uint32_t fEfl2 = fEfl1; \
1749 *puDst = uSrc; \
1750 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1751 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1752 \
1753 /* AMD: The carry flag is from the SUB operation. */ \
1754 /* 10890xe: PF always cleared? */ \
1755 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1756 fEfl2 |= fEfl1 & X86_EFL_CF; \
1757 *pfEFlags = fEfl2; \
1758}
1759
1760EMIT_BLSI(64, uint64_t, _fallback)
1761EMIT_BLSI(32, uint32_t, _fallback)
1762#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSI(64, uint64_t, RT_NOTHING)
1764#endif
1765#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1766EMIT_BLSI(32, uint32_t, RT_NOTHING)
1767#endif
1768
1769/*
1770 * BZHI (BMI2 instruction)
1771 */
1772#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1773IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1774 a_Type uSrc2, uint32_t *pfEFlags)) \
1775{ \
1776 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1777 a_Type uResult; \
1778 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1779 if (iFirstBit < a_cBits) \
1780 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1781 else \
1782 { \
1783 uResult = uSrc1; \
1784 fEfl |= X86_EFL_CF; \
1785 } \
1786 *puDst = uResult; \
1787 fEfl |= X86_EFL_CALC_ZF(uResult); \
1788 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1789 *pfEFlags = fEfl; \
1790}
1791
1792EMIT_BZHI(64, uint64_t, _fallback)
1793EMIT_BZHI(32, uint32_t, _fallback)
1794#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1795EMIT_BZHI(64, uint64_t, RT_NOTHING)
1796#endif
1797#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1798EMIT_BZHI(32, uint32_t, RT_NOTHING)
1799#endif
1800
1801/*
1802 * POPCNT
1803 */
/** Number of set bits for every 6-bit value (0..63); entry i holds the
 * population count of i.  Used by the iemPopCountUxx helpers below, which
 * index it with 6-bit slices of their input. */
RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
};
1811
1812/** @todo Use native popcount where possible and employ some more efficient
1813 * algorithm here (or in asm.h fallback)! */
1814
1815DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1816{
1817 return g_abBitCounts6[ u16 & 0x3f]
1818 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1819 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1820}
1821
1822DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1823{
1824 return g_abBitCounts6[ u32 & 0x3f]
1825 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1826 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1827 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1828 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1829 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1830}
1831
1832DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1833{
1834 return g_abBitCounts6[ u64 & 0x3f]
1835 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1836 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1837 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1838 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1839 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1840 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1841 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1842 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1843 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1844 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1845}
1846
1847#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1848IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1849{ \
1850 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1851 a_Type uResult; \
1852 if (uSrc) \
1853 uResult = iemPopCountU ## a_cBits(uSrc); \
1854 else \
1855 { \
1856 fEfl |= X86_EFL_ZF; \
1857 uResult = 0; \
1858 } \
1859 *puDst = uResult; \
1860 *pfEFlags = fEfl; \
1861}
1862
1863EMIT_POPCNT(64, uint64_t, _fallback)
1864EMIT_POPCNT(32, uint32_t, _fallback)
1865EMIT_POPCNT(16, uint16_t, _fallback)
1866#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1867EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1868#endif
1869#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1870EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1871EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1872#endif
1873
1874
1875#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1876
1877/*
1878 * XCHG
1879 */
1880
1881IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1882{
1883#if ARCH_BITS >= 64
1884 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1885#else
1886 uint64_t uOldMem = *puMem;
1887 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1888 ASMNopPause();
1889 *puReg = uOldMem;
1890#endif
1891}
1892
1893# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1894
1895IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1896{
1897 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1898}
1899
1900
1901IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1902{
1903 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1904}
1905
1906
1907IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1908{
1909 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1910}
1911
1912# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1913
1914
1915/* Unlocked variants for fDisregardLock mode: */
1916
1917IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1918{
1919 uint64_t const uOld = *puMem;
1920 *puMem = *puReg;
1921 *puReg = uOld;
1922}
1923
1924# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1927{
1928 uint32_t const uOld = *puMem;
1929 *puMem = *puReg;
1930 *puReg = uOld;
1931}
1932
1933
1934IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1935{
1936 uint16_t const uOld = *puMem;
1937 *puMem = *puReg;
1938 *puReg = uOld;
1939}
1940
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1943{
1944 uint8_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1950
1951
1952/*
1953 * XADD and LOCK XADD.
1954 */
1955#define EMIT_XADD(a_cBitsWidth, a_Type) \
1956IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1957{ \
1958 a_Type uDst = *puDst; \
1959 a_Type uResult = uDst; \
1960 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1961 *puDst = uResult; \
1962 *puReg = uDst; \
1963} \
1964\
1965IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1966{ \
1967 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1968 a_Type uResult; \
1969 uint32_t fEflTmp; \
1970 do \
1971 { \
1972 uResult = uOld; \
1973 fEflTmp = *pfEFlags; \
1974 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
1975 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
1976 *puReg = uOld; \
1977 *pfEFlags = fEflTmp; \
1978}
1979EMIT_XADD(64, uint64_t)
1980# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1981EMIT_XADD(32, uint32_t)
1982EMIT_XADD(16, uint16_t)
1983EMIT_XADD(8, uint8_t)
1984# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1985
1986#endif
1987
1988/*
1989 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
1990 *
1991 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
1992 * instructions are emulated as locked.
1993 */
1994#if defined(IEM_WITHOUT_ASSEMBLY)
1995
1996IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
1997{
1998 uint8_t uOld = *puAl;
1999 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2000 Assert(*puAl == uOld);
2001 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2002}
2003
2004
2005IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2006{
2007 uint16_t uOld = *puAx;
2008 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2009 Assert(*puAx == uOld);
2010 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2011}
2012
2013
2014IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2015{
2016 uint32_t uOld = *puEax;
2017 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2018 Assert(*puEax == uOld);
2019 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2020}
2021
2022
2023# if ARCH_BITS == 32
2024IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2025# else
2026IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2027# endif
2028{
2029# if ARCH_BITS == 32
2030 uint64_t const uSrcReg = *puSrcReg;
2031# endif
2032 uint64_t uOld = *puRax;
2033 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2034 Assert(*puRax == uOld);
2035 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2040 uint32_t *pEFlags))
2041{
2042 uint64_t const uNew = pu64EbxEcx->u;
2043 uint64_t const uOld = pu64EaxEdx->u;
2044 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2045 {
2046 Assert(pu64EaxEdx->u == uOld);
2047 *pEFlags |= X86_EFL_ZF;
2048 }
2049 else
2050 *pEFlags &= ~X86_EFL_ZF;
2051}
2052
2053
2054# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
2055IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2056 uint32_t *pEFlags))
2057{
2058# ifdef VBOX_STRICT
2059 RTUINT128U const uOld = *pu128RaxRdx;
2060# endif
2061# if defined(RT_ARCH_AMD64)
2062 if (ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
2063 &pu128RaxRdx->u))
2064# else
2065 if (ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u))
2066# endif
2067 {
2068 Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
2069 *pEFlags |= X86_EFL_ZF;
2070 }
2071 else
2072 *pEFlags &= ~X86_EFL_ZF;
2073}
2074# endif
2075
2076#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2077
2078# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2079IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2080 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2081{
2082 RTUINT128U u128Tmp = *pu128Dst;
2083 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2084 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2085 {
2086 *pu128Dst = *pu128RbxRcx;
2087 *pEFlags |= X86_EFL_ZF;
2088 }
2089 else
2090 {
2091 *pu128RaxRdx = u128Tmp;
2092 *pEFlags &= ~X86_EFL_ZF;
2093 }
2094}
2095#endif /* !RT_ARCH_ARM64 */
2096
2097#if defined(IEM_WITHOUT_ASSEMBLY)
2098
2099/* Unlocked versions mapped to the locked ones: */
2100
/**
 * Unlocked 8-bit CMPXCHG; forwards unchanged to iemAImpl_cmpxchg_u8_locked.
 *
 * @param   pu8Dst      The memory operand.
 * @param   puAl        The accumulator operand.
 * @param   uSrcReg     The source register value.
 * @param   pEFlags     The EFLAGS to update.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
}
2105
2106
/**
 * Unlocked 16-bit CMPXCHG; forwards unchanged to iemAImpl_cmpxchg_u16_locked.
 *
 * @param   pu16Dst     The memory operand.
 * @param   puAx        The accumulator operand.
 * @param   uSrcReg     The source register value.
 * @param   pEFlags     The EFLAGS to update.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
}
2111
2112
/**
 * Unlocked 32-bit CMPXCHG; forwards unchanged to iemAImpl_cmpxchg_u32_locked.
 *
 * @param   pu32Dst     The memory operand.
 * @param   puEax       The accumulator operand.
 * @param   uSrcReg     The source register value.
 * @param   pEFlags     The EFLAGS to update.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
}
2117
2118
/*
 * Unlocked 64-bit CMPXCHG; forwards unchanged to iemAImpl_cmpxchg_u64_locked.
 * As with the locked worker, the source register is passed by reference when
 * ARCH_BITS is 32 and by value otherwise.
 */
# if ARCH_BITS == 32
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
}
# else
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
}
# endif
2130
2131
/**
 * Unlocked CMPXCHG8B; forwards unchanged to iemAImpl_cmpxchg8b_locked.
 *
 * @param   pu64Dst     The memory operand.
 * @param   pu64EaxEdx  The EDX:EAX operand.
 * @param   pu64EbxEcx  The ECX:EBX operand.
 * @param   pEFlags     The EFLAGS to update.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
}
2136
2137
/**
 * Unlocked CMPXCHG16B; forwards unchanged to iemAImpl_cmpxchg16b_locked.
 *
 * NOTE(review): the locked worker in this file is only defined when
 * RT_ARCH_AMD64 or RT_ARCH_ARM64 is defined, whereas this wrapper is not
 * guarded the same way - confirm other hosts get a definition elsewhere.
 *
 * @param   pu128Dst        The memory operand.
 * @param   pu128RaxRdx     The RDX:RAX operand.
 * @param   pu128RbxRcx     The RCX:RBX operand.
 * @param   pEFlags         The EFLAGS to update.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                             uint32_t *pEFlags))
{
    iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
}
2143
2144#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2145
2146#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2147 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2148
2149/*
2150 * MUL, IMUL, DIV and IDIV helpers.
2151 *
2152 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2153 * division step so we can select between using C operators and
2154 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2155 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
 *   IDIV/DIV taking all their input from AX too.  This means we have to
 *   abstract some input loads and the result storing.
2159 */
2160
2161DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2162{
2163# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2164 pQuotient->s.Lo = 0;
2165 pQuotient->s.Hi = 0;
2166# endif
2167 RTUINT128U Divisor;
2168 Divisor.s.Lo = u64Divisor;
2169 Divisor.s.Hi = 0;
2170 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2171}
2172
/*
 * Operand access and arithmetic primitives for the EMIT_MUL, EMIT_IMUL,
 * EMIT_DIV and EMIT_IDIV templates.
 *
 * These deliberately refer to the puA, puD and puAX parameters of the function
 * they are expanded in.  Every macro expands to a single, fully parenthesized
 * expression (or a plain function call) without a trailing semicolon, so it
 * can be used wherever a statement is expected, including as the body of an
 * unbraced if/else.
 */

/* Loads the double width dividend: low half from *puA, high half from *puD. */
# define DIV_LOAD(a_Dividend) \
    ((a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD)
/* Loads the 16-bit dividend of the 8-bit division from *puAX. */
# define DIV_LOAD_U8(a_Dividend) \
    ((a_Dividend).u = *puAX)

/* Stores the quotient in *puA and the remainder in *puD. */
# define DIV_STORE(a_Quotient, a_uRemainder)    (*puA = (a_Quotient), *puD = (a_uRemainder))
/* Stores the quotient in the low byte and the remainder in the high byte of *puAX. */
# define DIV_STORE_U8(a_Quotient, a_uRemainder) (*puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8))

/* Fetches the implicit first factor: *puA, or the low byte of *puAX for 8-bit. */
# define MUL_LOAD_F1()                          (*puA)
# define MUL_LOAD_F1_U8()                       ((uint8_t)*puAX)

/* Stores the double width product: low/high halves in *puA/*puD, or all of it in *puAX for 8-bit. */
# define MUL_STORE(a_Result)                    (*puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi)
# define MUL_STORE_U8(a_Result)                 (*puAX = (a_Result).u)

/* Two's complement negation of a double width value, in place. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    ((a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u)
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Unsigned multiplication producing a double width result. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    ((a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2))
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), (a_Factor1), (a_Factor2))

/* Unsigned division of a double width dividend yielding quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    ((a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
     (a_Remainder).u = (a_Dividend).u % (a_uDivisor))
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), (a_uDivisor))

2202
2203
2204/*
2205 * MUL
2206 */
/*
 * Emits one unsigned multiplication worker, iemAImpl_mul_u<width><suffix>.
 *
 * The implicit factor is fetched with a_fnLoadF1, multiplied by uFactor using
 * a_fnMul and the double width product written back with a_fnStore.  CF and OF
 * are set when the high half of the product is non-zero and cleared otherwise.
 * With a_fIntelFlags set, AF and ZF are additionally cleared and SF and PF
 * calculated from the low half; without it they are left untouched.  The
 * worker always returns 0.
 */
# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
{ \
    RTUINT ## a_cBitsWidth2x ## U Result; \
    a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
    a_fnStore(Result); \
    \
    /* Calc EFLAGS: CF and OF are recalculated by both flavours. */ \
    uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
    if (a_fIntelFlags) \
    {   /* Intel: 6700K and 10980XE behavior */ \
        fEfl &= ~(X86_EFL_SF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
        fEfl |= (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) ? X86_EFL_SF : 0; \
        fEfl |= g_afParity[Result.s.Lo & 0xff]; \
    } \
    /* else: AMD 3990X leaves SF, AF, ZF and PF alone. */ \
    if (Result.s.Hi != 0) \
        fEfl |= X86_EFL_CF | X86_EFL_OF; \
    *pfEFlags = fEfl; \
    return 0; \
}
2235
/*
 * Emits the three flavours of an unsigned multiplication worker: the
 * unsuffixed one and _intel (both with a_fIntelFlags = 1) plus _amd
 * (a_fIntelFlags = 0).
 */
# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
    EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
    EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel,     1) \
    EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd,       0) \

# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
/* 64-bit: the product needs a 128-bit intermediate, hence the U128 multiplier. */
EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
         MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
#  if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
/* 32-bit and 16-bit: plain C multiplication on the double width type. */
EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags),  (puA, puD, uFactor, pfEFlags),
         MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags),  (puA, puD, uFactor, pfEFlags),
         MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
/* 8-bit: factor and product both live in *puAX, so the U8 load/store helpers are used. */
EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags),                  (puAX, uFactor, pfEFlags),
         MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
#  endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
# endif /* !DOXYGEN_RUNNING */
2253
2254/*
2255 * MULX
2256 */
2257# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2258IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2259 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2260{ \
2261 RTUINT ## a_cBitsWidth2x ## U Result; \
2262 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2263 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2264 *puDst1 = Result.s.Hi; \
2265} \
2266
2267# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2268EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2269EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2270# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2271EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2272EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2273# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2274# endif /* !DOXYGEN_RUNNING */
2275
2276
2277/*
2278 * IMUL
2279 *
2280 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2281 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2282 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2283 */
/*
 * Emits one signed multiplication worker, iemAImpl_imul_u<width><suffix>.
 *
 * The multiplication is carried out on the magnitudes of the two factors using
 * the unsigned a_fnMul primitive and the product negated afterwards (a_fnNeg)
 * when exactly one factor is negative.  CF and OF are set when the product
 * does not fit the signed single width range.  With a_fIntelFlags set, AF and
 * ZF are cleared and SF and PF calculated from the low half of the result;
 * without it those four flags are left untouched.  The worker always returns 0.
 */
# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    RTUINT ## a_cBitsWidth2x ## U Result; \
    uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
    \
    /* Split both factors into sign and magnitude. */ \
    uint ## a_cBitsWidth ## _t const uFactor1   = a_fnLoadF1(); \
    bool const                       fNegative1 = (uFactor1 & RT_BIT_64(a_cBitsWidth - 1)) != 0; \
    bool const                       fNegative2 = (uFactor2 & RT_BIT_64(a_cBitsWidth - 1)) != 0; \
    uint ## a_cBitsWidth ## _t const uMagnitude1 = fNegative1 \
                                                 ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - uFactor1) \
                                                 : uFactor1; \
    uint ## a_cBitsWidth ## _t const uMagnitude2 = fNegative2 \
                                                 ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - uFactor2) \
                                                 : uFactor2; \
    a_fnMul(Result, uMagnitude1, uMagnitude2, a_cBitsWidth2x); \
    \
    /* A negative product may reach a magnitude of 2^(width-1), a positive one must stay below that. */ \
    bool const     fNegativeProduct = fNegative1 != fNegative2; \
    uint64_t const uMaxMagnitude    = fNegativeProduct \
                                    ? RT_BIT_64(a_cBitsWidth - 1) \
                                    : RT_BIT_64(a_cBitsWidth - 1) - 1; \
    if (Result.s.Hi != 0 || Result.s.Lo > uMaxMagnitude) \
        fEfl |= X86_EFL_CF | X86_EFL_OF; \
    if (fNegativeProduct) \
    { \
        a_fnNeg(Result, a_cBitsWidth2x); \
    } \
    a_fnStore(Result); \
    \
    if (a_fIntelFlags) \
    { \
        fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
        fEfl |= (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) ? X86_EFL_SF : 0; \
        fEfl |= g_afParity[Result.s.Lo & 0xff]; \
    } \
    *pfEFlags = fEfl; \
    return 0; \
}
/*
 * Emits the three flavours of a signed multiplication worker: the unsuffixed
 * one and _intel (both with a_fIntelFlags = 1) plus _amd (a_fIntelFlags = 0).
 */
# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
    EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
    EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel,     1) \
    EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd,       0)

# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
/* 64-bit: uses the 128-bit negate and multiply primitives. */
EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
          MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
#  if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
/* 32-bit and 16-bit: plain C arithmetic on the double width type. */
EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags),  (puA, puD, uFactor2, pfEFlags),
          MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags),  (puA, puD, uFactor2, pfEFlags),
          MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
/* 8-bit: factor and product both live in *puAX, so the U8 load/store helpers are used. */
EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags),                  (puAX, uFactor2, pfEFlags),
          MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
#  endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
# endif /* !DOXYGEN_RUNNING */
2357
2358
2359/*
2360 * IMUL with two operands are mapped onto the three operand variant, ignoring
2361 * the high part of the product.
2362 */
2363# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2364IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2365{ \
2366 a_uType uIgn; \
2367 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2368} \
2369\
2370IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2371{ \
2372 a_uType uIgn; \
2373 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2374} \
2375\
2376IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2377{ \
2378 a_uType uIgn; \
2379 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2380}
2381
2382EMIT_IMUL_TWO(64, uint64_t)
2383# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2384EMIT_IMUL_TWO(32, uint32_t)
2385EMIT_IMUL_TWO(16, uint16_t)
2386# endif
2387
2388
2389/*
2390 * DIV
2391 */
/*
 * Emits one unsigned division worker, iemAImpl_div_u<width><suffix>.
 *
 * Returns -1 (the caller is to raise #DE) without storing anything when the
 * divisor is zero or the high half of the dividend is not below the divisor,
 * i.e. when the quotient would not fit.  Otherwise quotient and remainder are
 * stored with a_fnStore and 0 is returned.  EFLAGS are only modified when
 * a_fIntelFlags is clear.
 */
# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
                        a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    RTUINT ## a_cBitsWidth2x ## U Dividend; \
    a_fnLoad(Dividend); \
    if (   uDivisor == 0 \
        || Dividend.s.Hi >= uDivisor) \
        return -1; /* #DE */ \
    \
    RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
    a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
    a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
    \
    /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone.  AMD 3990X sets AF and clears PF, ZF and SF. */ \
    if (!a_fIntelFlags) \
        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
    return 0; \
}
/*
 * Emits the three flavours of an unsigned division worker: the unsuffixed one
 * and _intel (both with a_fIntelFlags = 1) plus _amd (a_fIntelFlags = 0).
 */
# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
    EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
    EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel,     1) \
    EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd,       0)

# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
/* 64-bit: the dividend is 128 bits wide, hence the U128 division primitive. */
EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
         DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
#  if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
/* 32-bit and 16-bit: plain C division on the double width type. */
EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
         DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
         DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
/* 8-bit: dividend and results all live in *puAX, so the U8 load/store helpers are used. */
EMIT_DIV(8,16,  (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags),                (puAX, uDivisor, pfEFlags),
         DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
#  endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
# endif /* !DOXYGEN_RUNNING */
2430
2431
2432/*
2433 * IDIV
2434 *
2435 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2436 * set AF and clear PF, ZF and SF just like it does for DIV.
2437 *
2438 */
/*
 * Emits one signed division worker, iemAImpl_idiv_u<width><suffix>.
 *
 * The division is carried out unsigned on the magnitudes of dividend and
 * divisor; the signs are reapplied afterwards: the quotient is negative when
 * the operand signs differ and the remainder takes the sign of the dividend.
 * Returns -1 (the caller is to raise #DE) without storing anything when the
 * divisor is zero or the quotient does not fit the signed single width range,
 * and 0 after storing the results with a_fnStore.  EFLAGS are only modified
 * when a_fIntelFlags is clear.
 */
# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    /* Note! Skylake leaves all flags alone. */ \
    \
    if (uDivisor == 0) \
        return -1; /* #DE */ \
    \
    /* \
     * Convert to unsigned division. \
     */ \
    RTUINT ## a_cBitsWidth2x ## U Dividend; \
    a_fnLoad(Dividend); \
    bool const fNegativeDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
    if (fNegativeDividend) \
    { \
        a_fnNeg(Dividend, a_cBitsWidth2x); \
    } \
    \
    bool const                       fNegativeDivisor = RT_BOOL(uDivisor & RT_BIT_64(a_cBitsWidth - 1)); \
    uint ## a_cBitsWidth ## _t const uDivisorPositive = fNegativeDivisor \
                                                      ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - uDivisor) \
                                                      : uDivisor; \
    \
    RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
    a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
    \
    /* \
     * Check for overflow: a negative quotient may reach a magnitude of \
     * 2^(width-1), a positive one is limited to the signed maximum. \
     */ \
    bool const     fNegativeQuotient = fNegativeDividend != fNegativeDivisor; \
    uint64_t const uMaxQuotient      = fNegativeQuotient \
                                     ? RT_BIT_64(a_cBitsWidth - 1) \
                                     : (uint64_t)(uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX; \
    if (Quotient.s.Hi != 0 || Quotient.s.Lo > uMaxQuotient) \
        return -1; /* #DE */ \
    \
    /* \
     * Reapply the signs and store the result. \
     */ \
    uint ## a_cBitsWidth ## _t const uQuotient  = fNegativeQuotient \
                                                ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo) \
                                                : Quotient.s.Lo; \
    uint ## a_cBitsWidth ## _t const uRemainder = fNegativeDividend \
                                                ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo) \
                                                : Remainder.s.Lo; \
    a_fnStore(uQuotient, uRemainder); \
    if (!a_fIntelFlags) \
        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
    return 0; \
}
/*
 * Emits the three flavours of a signed division worker: the unsuffixed one and
 * _intel (both with a_fIntelFlags = 1) plus _amd (a_fIntelFlags = 0).
 */
# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel,     1) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd,       0)

# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
/* 64-bit: uses the 128-bit negate and division primitives. */
EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
          DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
#  if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
/* 32-bit and 16-bit: plain C arithmetic on the double width type. */
EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags),  (puA, puD, uDivisor, pfEFlags),
          DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags),  (puA, puD, uDivisor, pfEFlags),
          DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
/* 8-bit: dividend and results all live in *puAX, so the U8 load/store helpers are used. */
EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags),                  (puAX, uDivisor, pfEFlags),
          DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
#  endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
# endif /* !DOXYGEN_RUNNING */
2540
2541#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2542
2543
2544/*********************************************************************************************************************************
2545* Unary operations. *
2546*********************************************************************************************************************************/
2547#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2548
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Updates the status bits PF, AF, ZF, SF and OF for an INC or DEC instruction.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * This is a statement macro: it does not yield a value, the updated flags are
 * written back through @a a_pfEFlags.  Note that @a a_uResult and @a a_uDst are
 * evaluated more than once, so they must not have side effects.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be a
 *                          plain number as it is pasted into a macro name.
 * @param   a_OfMethod      0 for INC-style, anything else for DEC-style
 *                          overflow detection.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        /* Clear all status bits except CF, which INC/DEC leave as-is. */ \
        fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        /* AF: carry/borrow between bit 3 and 4 shows up as a change in bit 4. */ \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* OF: INC overflows going from positive to negative, DEC going from negative to positive. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_OfMethod) == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
                                                   : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2574
2575/*
2576 * INC
2577 */
2578
2579IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2580{
2581 uint64_t uDst = *puDst;
2582 uint64_t uResult = uDst + 1;
2583 *puDst = uResult;
2584 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2585}
2586
2587# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2588
2589IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2590{
2591 uint32_t uDst = *puDst;
2592 uint32_t uResult = uDst + 1;
2593 *puDst = uResult;
2594 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2595}
2596
2597
2598IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2599{
2600 uint16_t uDst = *puDst;
2601 uint16_t uResult = uDst + 1;
2602 *puDst = uResult;
2603 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2604}
2605
2606IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2607{
2608 uint8_t uDst = *puDst;
2609 uint8_t uResult = uDst + 1;
2610 *puDst = uResult;
2611 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2612}
2613
2614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2615
2616
2617/*
2618 * DEC
2619 */
2620
2621IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2622{
2623 uint64_t uDst = *puDst;
2624 uint64_t uResult = uDst - 1;
2625 *puDst = uResult;
2626 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2627}
2628
2629# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint32_t uDst = *puDst;
2634 uint32_t uResult = uDst - 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2637}
2638
2639
2640IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2641{
2642 uint16_t uDst = *puDst;
2643 uint16_t uResult = uDst - 1;
2644 *puDst = uResult;
2645 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2646}
2647
2648
2649IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2650{
2651 uint8_t uDst = *puDst;
2652 uint8_t uResult = uDst - 1;
2653 *puDst = uResult;
2654 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2655}
2656
2657# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2658
2659
2660/*
2661 * NOT
2662 */
2663
2664IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2665{
2666 uint64_t uDst = *puDst;
2667 uint64_t uResult = ~uDst;
2668 *puDst = uResult;
2669 /* EFLAGS are not modified. */
2670 RT_NOREF_PV(pfEFlags);
2671}
2672
2673# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2674
2675IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2676{
2677 uint32_t uDst = *puDst;
2678 uint32_t uResult = ~uDst;
2679 *puDst = uResult;
2680 /* EFLAGS are not modified. */
2681 RT_NOREF_PV(pfEFlags);
2682}
2683
2684IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2685{
2686 uint16_t uDst = *puDst;
2687 uint16_t uResult = ~uDst;
2688 *puDst = uResult;
2689 /* EFLAGS are not modified. */
2690 RT_NOREF_PV(pfEFlags);
2691}
2692
2693IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2694{
2695 uint8_t uDst = *puDst;
2696 uint8_t uResult = ~uDst;
2697 *puDst = uResult;
2698 /* EFLAGS are not modified. */
2699 RT_NOREF_PV(pfEFlags);
2700}
2701
2702# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2703
2704
2705/*
2706 * NEG
2707 */
2708
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for a NEG instruction.
 *
 * This is a statement macro; it yields no value and writes the updated flags
 * back through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for CF, AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflNew = *(a_pfEFlags) & ~(X86_EFL_STATUS_BITS | X86_EFL_CF); \
        /* CF is set for every non-zero input. */ \
        fEflNew |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
        fEflNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflNew |= g_afParity[(a_uResult) & 0xff]; \
        /* AF: bit 4 changed between input and result. */ \
        fEflNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        /* OF: input and result both have the sign bit set. */ \
        fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        *(a_pfEFlags) = fEflNew; \
    } while (0)
2730
2731IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2732{
2733 uint64_t uDst = *puDst;
2734 uint64_t uResult = (uint64_t)0 - uDst;
2735 *puDst = uResult;
2736 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2737}
2738
2739# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2740
2741IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2742{
2743 uint32_t uDst = *puDst;
2744 uint32_t uResult = (uint32_t)0 - uDst;
2745 *puDst = uResult;
2746 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2747}
2748
2749
2750IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2751{
2752 uint16_t uDst = *puDst;
2753 uint16_t uResult = (uint16_t)0 - uDst;
2754 *puDst = uResult;
2755 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2756}
2757
2758
2759IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2760{
2761 uint8_t uDst = *puDst;
2762 uint8_t uResult = (uint8_t)0 - uDst;
2763 *puDst = uResult;
2764 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2765}
2766
2767# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2768
2769/*
2770 * Locked variants.
2771 */
2772
2773/** Emit a function for doing a locked unary operand operation. */
2774# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2775 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2776 uint32_t *pfEFlags)) \
2777 { \
2778 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2779 uint ## a_cBitsWidth ## _t uTmp; \
2780 uint32_t fEflTmp; \
2781 do \
2782 { \
2783 uTmp = uOld; \
2784 fEflTmp = *pfEFlags; \
2785 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2786 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2787 *pfEFlags = fEflTmp; \
2788 }
2789
2790EMIT_LOCKED_UNARY_OP(inc, 64)
2791EMIT_LOCKED_UNARY_OP(dec, 64)
2792EMIT_LOCKED_UNARY_OP(not, 64)
2793EMIT_LOCKED_UNARY_OP(neg, 64)
2794# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2795EMIT_LOCKED_UNARY_OP(inc, 32)
2796EMIT_LOCKED_UNARY_OP(dec, 32)
2797EMIT_LOCKED_UNARY_OP(not, 32)
2798EMIT_LOCKED_UNARY_OP(neg, 32)
2799
2800EMIT_LOCKED_UNARY_OP(inc, 16)
2801EMIT_LOCKED_UNARY_OP(dec, 16)
2802EMIT_LOCKED_UNARY_OP(not, 16)
2803EMIT_LOCKED_UNARY_OP(neg, 16)
2804
2805EMIT_LOCKED_UNARY_OP(inc, 8)
2806EMIT_LOCKED_UNARY_OP(dec, 8)
2807EMIT_LOCKED_UNARY_OP(not, 8)
2808EMIT_LOCKED_UNARY_OP(neg, 8)
2809# endif
2810
2811#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2812
2813
2814/*********************************************************************************************************************************
2815* Shifting and Rotating *
2816*********************************************************************************************************************************/
2817
2818/*
2819 * ROL
2820 */
2821#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2822IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2823{ \
2824 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2825 if (cShift) \
2826 { \
2827 if (a_cBitsWidth < 32) \
2828 cShift &= a_cBitsWidth - 1; \
2829 a_uType const uDst = *puDst; \
2830 a_uType const uResult = a_fnHlp(uDst, cShift); \
2831 *puDst = uResult; \
2832 \
2833 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
2834 it the same way as for 1 bit shifts. */ \
2835 AssertCompile(X86_EFL_CF_BIT == 0); \
2836 uint32_t fEfl = *pfEFlags; \
2837 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2838 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2839 fEfl |= fCarry; \
2840 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2841 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2842 else /* Intel 10980XE: According to the first sub-shift: */ \
2843 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2844 *pfEFlags = fEfl; \
2845 } \
2846}
2847
2848#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2849EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2850#endif
2851EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2852EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2853
2854#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2855EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2856#endif
2857EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2858EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2859
2860DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2861{
2862 return (uValue << cShift) | (uValue >> (16 - cShift));
2863}
2864#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2865EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2866#endif
2867EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2868EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2869
2870DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2871{
2872 return (uValue << cShift) | (uValue >> (8 - cShift));
2873}
2874#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2875EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2876#endif
2877EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2878EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2879
2880
2881/*
2882 * ROR
2883 */
2884#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2885IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2886{ \
2887 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2888 if (cShift) \
2889 { \
2890 if (a_cBitsWidth < 32) \
2891 cShift &= a_cBitsWidth - 1; \
2892 a_uType const uDst = *puDst; \
2893 a_uType const uResult = a_fnHlp(uDst, cShift); \
2894 *puDst = uResult; \
2895 \
2896 /* Calc EFLAGS: */ \
2897 AssertCompile(X86_EFL_CF_BIT == 0); \
2898 uint32_t fEfl = *pfEFlags; \
2899 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2900 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2901 fEfl |= fCarry; \
2902 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2903 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2904 else /* Intel 10980XE: According to the first sub-shift: */ \
2905 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2906 *pfEFlags = fEfl; \
2907 } \
2908}
2909
2910#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2911EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2912#endif
2913EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2914EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2915
2916#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2917EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2918#endif
2919EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2920EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2921
2922DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2923{
2924 return (uValue >> cShift) | (uValue << (16 - cShift));
2925}
2926#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2927EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2928#endif
2929EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2930EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2931
2932DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2933{
2934 return (uValue >> cShift) | (uValue << (8 - cShift));
2935}
2936#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2937EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2938#endif
2939EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2940EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2941
2942
2943/*
2944 * RCL
2945 */
2946#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2947IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2948{ \
2949 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2950 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2951 cShift %= a_cBitsWidth + 1; \
2952 if (cShift) \
2953 { \
2954 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2955 cShift %= a_cBitsWidth + 1; \
2956 a_uType const uDst = *puDst; \
2957 a_uType uResult = uDst << cShift; \
2958 if (cShift > 1) \
2959 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2960 \
2961 AssertCompile(X86_EFL_CF_BIT == 0); \
2962 uint32_t fEfl = *pfEFlags; \
2963 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2964 uResult |= (a_uType)fInCarry << (cShift - 1); \
2965 \
2966 *puDst = uResult; \
2967 \
2968 /* Calc EFLAGS. */ \
2969 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2970 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2971 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2972 fEfl |= fOutCarry; \
2973 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2974 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
2975 else /* Intel 10980XE: According to the first sub-shift: */ \
2976 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2977 *pfEFlags = fEfl; \
2978 } \
2979}
2980
2981#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2982EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
2983#endif
2984EMIT_RCL(64, uint64_t, _intel, 1)
2985EMIT_RCL(64, uint64_t, _amd, 0)
2986
2987#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2988EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
2989#endif
2990EMIT_RCL(32, uint32_t, _intel, 1)
2991EMIT_RCL(32, uint32_t, _amd, 0)
2992
2993#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2994EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
2995#endif
2996EMIT_RCL(16, uint16_t, _intel, 1)
2997EMIT_RCL(16, uint16_t, _amd, 0)
2998
2999#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3000EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3001#endif
3002EMIT_RCL(8, uint8_t, _intel, 1)
3003EMIT_RCL(8, uint8_t, _amd, 0)
3004
3005
3006/*
3007 * RCR
3008 */
3009#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3010IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3011{ \
3012 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3013 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3014 cShift %= a_cBitsWidth + 1; \
3015 if (cShift) \
3016 { \
3017 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3018 cShift %= a_cBitsWidth + 1; \
3019 a_uType const uDst = *puDst; \
3020 a_uType uResult = uDst >> cShift; \
3021 if (cShift > 1) \
3022 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3023 \
3024 AssertCompile(X86_EFL_CF_BIT == 0); \
3025 uint32_t fEfl = *pfEFlags; \
3026 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3027 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3028 *puDst = uResult; \
3029 \
3030 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3031 it the same way as for 1 bit shifts. */ \
3032 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3033 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3034 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3035 fEfl |= fOutCarry; \
3036 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3037 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3038 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3039 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3040 *pfEFlags = fEfl; \
3041 } \
3042}
3043
3044#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3045EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3046#endif
3047EMIT_RCR(64, uint64_t, _intel, 1)
3048EMIT_RCR(64, uint64_t, _amd, 0)
3049
3050#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3051EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3052#endif
3053EMIT_RCR(32, uint32_t, _intel, 1)
3054EMIT_RCR(32, uint32_t, _amd, 0)
3055
3056#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3057EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3058#endif
3059EMIT_RCR(16, uint16_t, _intel, 1)
3060EMIT_RCR(16, uint16_t, _amd, 0)
3061
3062#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3063EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3064#endif
3065EMIT_RCR(8, uint8_t, _intel, 1)
3066EMIT_RCR(8, uint8_t, _amd, 0)
3067
3068
3069/*
3070 * SHL
3071 */
3072#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3073IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3074{ \
3075 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3076 if (cShift) \
3077 { \
3078 a_uType const uDst = *puDst; \
3079 a_uType uResult = uDst << cShift; \
3080 *puDst = uResult; \
3081 \
3082 /* Calc EFLAGS. */ \
3083 AssertCompile(X86_EFL_CF_BIT == 0); \
3084 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3085 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3086 fEfl |= fCarry; \
3087 if (!a_fIntelFlags) \
3088 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3089 else \
3090 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3091 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3092 fEfl |= X86_EFL_CALC_ZF(uResult); \
3093 fEfl |= g_afParity[uResult & 0xff]; \
3094 if (!a_fIntelFlags) \
3095 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3096 *pfEFlags = fEfl; \
3097 } \
3098}
3099
3100#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3101EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3102#endif
3103EMIT_SHL(64, uint64_t, _intel, 1)
3104EMIT_SHL(64, uint64_t, _amd, 0)
3105
3106#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3107EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3108#endif
3109EMIT_SHL(32, uint32_t, _intel, 1)
3110EMIT_SHL(32, uint32_t, _amd, 0)
3111
3112#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3113EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3114#endif
3115EMIT_SHL(16, uint16_t, _intel, 1)
3116EMIT_SHL(16, uint16_t, _amd, 0)
3117
3118#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3119EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3120#endif
3121EMIT_SHL(8, uint8_t, _intel, 1)
3122EMIT_SHL(8, uint8_t, _amd, 0)
3123
3124
3125/*
3126 * SHR
3127 */
3128#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3129IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3130{ \
3131 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3132 if (cShift) \
3133 { \
3134 a_uType const uDst = *puDst; \
3135 a_uType uResult = uDst >> cShift; \
3136 *puDst = uResult; \
3137 \
3138 /* Calc EFLAGS. */ \
3139 AssertCompile(X86_EFL_CF_BIT == 0); \
3140 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3141 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3142 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3143 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3144 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3145 fEfl |= X86_EFL_CALC_ZF(uResult); \
3146 fEfl |= g_afParity[uResult & 0xff]; \
3147 if (!a_fIntelFlags) \
3148 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3149 *pfEFlags = fEfl; \
3150 } \
3151}
3152
3153#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3154EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3155#endif
3156EMIT_SHR(64, uint64_t, _intel, 1)
3157EMIT_SHR(64, uint64_t, _amd, 0)
3158
3159#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3160EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3161#endif
3162EMIT_SHR(32, uint32_t, _intel, 1)
3163EMIT_SHR(32, uint32_t, _amd, 0)
3164
3165#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3166EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3167#endif
3168EMIT_SHR(16, uint16_t, _intel, 1)
3169EMIT_SHR(16, uint16_t, _amd, 0)
3170
3171#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3172EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3173#endif
3174EMIT_SHR(8, uint8_t, _intel, 1)
3175EMIT_SHR(8, uint8_t, _amd, 0)
3176
3177
3178/*
3179 * SAR
3180 */
3181#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3182IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3183{ \
3184 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3185 if (cShift) \
3186 { \
3187 a_iType const iDst = (a_iType)*puDst; \
3188 a_uType uResult = iDst >> cShift; \
3189 *puDst = uResult; \
3190 \
3191 /* Calc EFLAGS. \
3192 Note! The OF flag is always zero because the result never differs from the input. */ \
3193 AssertCompile(X86_EFL_CF_BIT == 0); \
3194 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3195 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3196 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3197 fEfl |= X86_EFL_CALC_ZF(uResult); \
3198 fEfl |= g_afParity[uResult & 0xff]; \
3199 if (!a_fIntelFlags) \
3200 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3201 *pfEFlags = fEfl; \
3202 } \
3203}
3204
3205#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3206EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3207#endif
3208EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3209EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3210
3211#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3212EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3213#endif
3214EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3215EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3216
3217#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3218EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3219#endif
3220EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3221EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3222
3223#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3224EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3225#endif
3226EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3227EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3228
3229
3230/*
3231 * SHLD
3232 *
3233 * - CF is the last bit shifted out of puDst.
3234 * - AF is always cleared by Intel 10980XE.
3235 * - AF is always set by AMD 3990X.
3236 * - OF is set according to the first shift on Intel 10980XE, it seems.
3237 * - OF is set according to the last sub-shift on AMD 3990X.
3238 * - ZF, SF and PF are calculated according to the result by both vendors.
3239 *
3240 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3241 * pick either the source register or the destination register for input bits
3242 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3243 * intel has changed behaviour here several times. We implement what current
3244 * skylake based does for now, we can extend this later as needed.
3245 */
3246#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3247IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3248 uint32_t *pfEFlags)) \
3249{ \
3250 cShift &= a_cBitsWidth - 1; \
3251 if (cShift) \
3252 { \
3253 a_uType const uDst = *puDst; \
3254 a_uType uResult = uDst << cShift; \
3255 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3256 *puDst = uResult; \
3257 \
3258 /* CALC EFLAGS: */ \
3259 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3260 if (a_fIntelFlags) \
3261 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3262 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3263 else \
3264 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3265 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3266 fEfl |= X86_EFL_AF; \
3267 } \
3268 AssertCompile(X86_EFL_CF_BIT == 0); \
3269 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3270 fEfl |= g_afParity[uResult & 0xff]; \
3271 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3272 fEfl |= X86_EFL_CALC_ZF(uResult); \
3273 *pfEFlags = fEfl; \
3274 } \
3275}
3276
3277#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3278EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3279#endif
3280EMIT_SHLD(64, uint64_t, _intel, 1)
3281EMIT_SHLD(64, uint64_t, _amd, 0)
3282
3283#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3284EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3285#endif
3286EMIT_SHLD(32, uint32_t, _intel, 1)
3287EMIT_SHLD(32, uint32_t, _amd, 0)
3288
3289#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3290IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3291{ \
3292 cShift &= 31; \
3293 if (cShift) \
3294 { \
3295 uint16_t const uDst = *puDst; \
3296 uint64_t const uTmp = a_fIntelFlags \
3297 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3298 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3299 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3300 *puDst = uResult; \
3301 \
3302 /* CALC EFLAGS: */ \
3303 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3304 AssertCompile(X86_EFL_CF_BIT == 0); \
3305 if (a_fIntelFlags) \
3306 { \
3307 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3308 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3309 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3310 } \
3311 else \
3312 { \
3313 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3314 if (cShift < 16) \
3315 { \
3316 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3317 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3318 } \
3319 else \
3320 { \
3321 if (cShift == 16) \
3322 fEfl |= uDst & X86_EFL_CF; \
3323 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3324 } \
3325 fEfl |= X86_EFL_AF; \
3326 } \
3327 fEfl |= g_afParity[uResult & 0xff]; \
3328 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3329 fEfl |= X86_EFL_CALC_ZF(uResult); \
3330 *pfEFlags = fEfl; \
3331 } \
3332}
3333
3334#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3335EMIT_SHLD_16(RT_NOTHING, 1)
3336#endif
3337EMIT_SHLD_16(_intel, 1)
3338EMIT_SHLD_16(_amd, 0)
3339
3340
3341/*
3342 * SHRD
3343 *
3344 * EFLAGS behaviour seems to be the same as with SHLD:
3345 * - CF is the last bit shifted out of puDst.
3346 * - AF is always cleared by Intel 10980XE.
3347 * - AF is always set by AMD 3990X.
3348 * - OF is set according to the first shift on Intel 10980XE, it seems.
3349 * - OF is set according to the last sub-shift on AMD 3990X.
3350 * - ZF, SF and PF are calculated according to the result by both vendors.
3351 *
3352 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3353 * pick either the source register or the destination register for input bits
3354 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3355 * intel has changed behaviour here several times. We implement what current
3356 * skylake based does for now, we can extend this later as needed.
3357 */
3358#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3359IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3360{ \
3361 cShift &= a_cBitsWidth - 1; \
3362 if (cShift) \
3363 { \
3364 a_uType const uDst = *puDst; \
3365 a_uType uResult = uDst >> cShift; \
3366 uResult |= uSrc << (a_cBitsWidth - cShift); \
3367 *puDst = uResult; \
3368 \
3369 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3370 AssertCompile(X86_EFL_CF_BIT == 0); \
3371 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3372 if (a_fIntelFlags) \
3373 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3374 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3375 else \
3376 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3377 if (cShift > 1) /* Set according to last shift. */ \
3378 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3379 else \
3380 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3381 fEfl |= X86_EFL_AF; \
3382 } \
3383 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3384 fEfl |= X86_EFL_CALC_ZF(uResult); \
3385 fEfl |= g_afParity[uResult & 0xff]; \
3386 *pfEFlags = fEfl; \
3387 } \
3388}
3389
3390#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3391EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3392#endif
3393EMIT_SHRD(64, uint64_t, _intel, 1)
3394EMIT_SHRD(64, uint64_t, _amd, 0)
3395
3396#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3397EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3398#endif
3399EMIT_SHRD(32, uint32_t, _intel, 1)
3400EMIT_SHRD(32, uint32_t, _amd, 0)
3401
3402#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3403IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3404{ \
3405 cShift &= 31; \
3406 if (cShift) \
3407 { \
3408 uint16_t const uDst = *puDst; \
3409 uint64_t const uTmp = a_fIntelFlags \
3410 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3411 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3412 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3413 *puDst = uResult; \
3414 \
3415 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3416 AssertCompile(X86_EFL_CF_BIT == 0); \
3417 if (a_fIntelFlags) \
3418 { \
3419 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3420 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3421 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3422 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3423 } \
3424 else \
3425 { \
3426 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3427 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3428 /* AMD 3990X: Set according to last shift. AF always set. */ \
3429 if (cShift > 1) /* Set according to last shift. */ \
3430 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3431 else \
3432 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3433 fEfl |= X86_EFL_AF; \
3434 } \
3435 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3436 fEfl |= X86_EFL_CALC_ZF(uResult); \
3437 fEfl |= g_afParity[uResult & 0xff]; \
3438 *pfEFlags = fEfl; \
3439 } \
3440}
3441
3442#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3443EMIT_SHRD_16(RT_NOTHING, 1)
3444#endif
3445EMIT_SHRD_16(_intel, 1)
3446EMIT_SHRD_16(_amd, 0)
3447
3448
3449/*
3450 * RORX (BMI2)
3451 */
3452#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3453IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3454{ \
3455 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3456}
3457
3458#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3459EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3460#endif
3461#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3462EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3463#endif
3464
3465
3466/*
3467 * SHLX (BMI2)
3468 */
3469#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3470IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3471{ \
3472 cShift &= a_cBitsWidth - 1; \
3473 *puDst = uSrc << cShift; \
3474}
3475
3476#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3477EMIT_SHLX(64, uint64_t, RT_NOTHING)
3478EMIT_SHLX(64, uint64_t, _fallback)
3479#endif
3480#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3481EMIT_SHLX(32, uint32_t, RT_NOTHING)
3482EMIT_SHLX(32, uint32_t, _fallback)
3483#endif
3484
3485
3486/*
3487 * SHRX (BMI2)
3488 */
3489#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3490IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3491{ \
3492 cShift &= a_cBitsWidth - 1; \
3493 *puDst = uSrc >> cShift; \
3494}
3495
3496#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3497EMIT_SHRX(64, uint64_t, RT_NOTHING)
3498EMIT_SHRX(64, uint64_t, _fallback)
3499#endif
3500#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3501EMIT_SHRX(32, uint32_t, RT_NOTHING)
3502EMIT_SHRX(32, uint32_t, _fallback)
3503#endif
3504
3505
3506/*
3507 * SARX (BMI2)
3508 */
3509#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3510IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3511{ \
3512 cShift &= a_cBitsWidth - 1; \
3513 *puDst = (a_iType)uSrc >> cShift; \
3514}
3515
3516#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3517EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3518EMIT_SARX(64, uint64_t, int64_t, _fallback)
3519#endif
3520#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3521EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3522EMIT_SARX(32, uint32_t, int32_t, _fallback)
3523#endif
3524
3525
3526/*
3527 * PDEP (BMI2)
3528 */
3529#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3530IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3531{ \
3532 a_uType uResult = 0; \
3533 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3534 if (fMask & ((a_uType)1 << iMaskBit)) \
3535 { \
3536 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3537 iBit++; \
3538 } \
3539 *puDst = uResult; \
3540}
3541
3542#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3543EMIT_PDEP(64, uint64_t, RT_NOTHING)
3544#endif
3545EMIT_PDEP(64, uint64_t, _fallback)
3546#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3547EMIT_PDEP(32, uint32_t, RT_NOTHING)
3548#endif
3549EMIT_PDEP(32, uint32_t, _fallback)
3550
3551/*
3552 * PEXT (BMI2)
3553 */
3554#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PEXT(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PEXT(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PEXT(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PEXT(32, uint32_t, _fallback)
3575
3576
3577#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3578
3579# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3580/*
3581 * BSWAP
3582 */
3583
3584IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3585{
3586 *puDst = ASMByteSwapU64(*puDst);
3587}
3588
3589
3590IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3591{
3592 *puDst = ASMByteSwapU32(*puDst);
3593}
3594
3595
3596/* Note! undocument, so 32-bit arg */
3597IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3598{
3599#if 0
3600 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3601#else
3602 /* This is the behaviour AMD 3990x (64-bit mode): */
3603 *(uint16_t *)puDst = 0;
3604#endif
3605}
3606
3607# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3608
3609
3610
3611# if defined(IEM_WITHOUT_ASSEMBLY)
3612
3613/*
3614 * LFENCE, SFENCE & MFENCE.
3615 */
3616
3617IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3618{
3619 ASMReadFence();
3620}
3621
3622
3623IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3624{
3625 ASMWriteFence();
3626}
3627
3628
3629IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3630{
3631 ASMMemoryFence();
3632}
3633
3634
3635# ifndef RT_ARCH_ARM64
3636IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3637{
3638 ASMMemoryFence();
3639}
3640# endif
3641
3642# endif
3643
3644#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3645
3646
3647IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3648{
3649 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3650 {
3651 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3652 *pu16Dst |= u16Src & X86_SEL_RPL;
3653
3654 *pfEFlags |= X86_EFL_ZF;
3655 }
3656 else
3657 *pfEFlags &= ~X86_EFL_ZF;
3658}
3659
3660
3661#if defined(IEM_WITHOUT_ASSEMBLY)
3662
3663/*********************************************************************************************************************************
3664* x87 FPU Loads *
3665*********************************************************************************************************************************/
3666
3667IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3668{
3669 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3670 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3671 {
3672 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3673 pFpuRes->r80Result.sj64.fInteger = 1;
3674 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3675 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3676 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3677 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3678 }
3679 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3680 {
3681 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3682 pFpuRes->r80Result.s.uExponent = 0;
3683 pFpuRes->r80Result.s.uMantissa = 0;
3684 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3685 }
3686 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3687 {
3688 /* Subnormal values gets normalized. */
3689 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3690 pFpuRes->r80Result.sj64.fInteger = 1;
3691 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3692 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3693 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3694 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3695 pFpuRes->FSW |= X86_FSW_DE;
3696 if (!(pFpuState->FCW & X86_FCW_DM))
3697 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3698 }
3699 else if (RTFLOAT32U_IS_INF(pr32Val))
3700 {
3701 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3702 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3703 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3704 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3705 }
3706 else
3707 {
3708 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3709 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3710 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3711 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3712 pFpuRes->r80Result.sj64.fInteger = 1;
3713 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3714 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3715 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3716 {
3717 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3718 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3719 pFpuRes->FSW |= X86_FSW_IE;
3720
3721 if (!(pFpuState->FCW & X86_FCW_IM))
3722 {
3723 /* The value is not pushed. */
3724 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3725 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3726 pFpuRes->r80Result.au64[0] = 0;
3727 pFpuRes->r80Result.au16[4] = 0;
3728 }
3729 }
3730 else
3731 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3732 }
3733}
3734
3735
3736IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3737{
3738 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3739 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3740 {
3741 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3742 pFpuRes->r80Result.sj64.fInteger = 1;
3743 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3744 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3745 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3746 }
3747 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3748 {
3749 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3750 pFpuRes->r80Result.s.uExponent = 0;
3751 pFpuRes->r80Result.s.uMantissa = 0;
3752 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3753 }
3754 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3755 {
3756 /* Subnormal values gets normalized. */
3757 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3758 pFpuRes->r80Result.sj64.fInteger = 1;
3759 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3760 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3761 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3762 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3763 pFpuRes->FSW |= X86_FSW_DE;
3764 if (!(pFpuState->FCW & X86_FCW_DM))
3765 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3766 }
3767 else if (RTFLOAT64U_IS_INF(pr64Val))
3768 {
3769 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3770 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3771 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3772 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3773 }
3774 else
3775 {
3776 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3777 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3778 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3779 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3780 pFpuRes->r80Result.sj64.fInteger = 1;
3781 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3782 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3783 {
3784 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3785 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3786 pFpuRes->FSW |= X86_FSW_IE;
3787
3788 if (!(pFpuState->FCW & X86_FCW_IM))
3789 {
3790 /* The value is not pushed. */
3791 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3792 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3793 pFpuRes->r80Result.au64[0] = 0;
3794 pFpuRes->r80Result.au16[4] = 0;
3795 }
3796 }
3797 else
3798 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3799 }
3800}
3801
3802
3803IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3804{
3805 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3806 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3807 /* Raises no exceptions. */
3808 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3809}
3810
3811
3812IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3813{
3814 pFpuRes->r80Result.sj64.fSign = 0;
3815 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3816 pFpuRes->r80Result.sj64.fInteger = 1;
3817 pFpuRes->r80Result.sj64.uFraction = 0;
3818
3819 /*
3820 * FPU status word:
3821 * - TOP is irrelevant, but we must match x86 assembly version.
3822 * - C1 is always cleared as we don't have any stack overflows.
3823 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3824 */
3825 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3826}
3827
3828
3829IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3830{
3831 pFpuRes->r80Result.sj64.fSign = 0;
3832 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3833 pFpuRes->r80Result.sj64.fInteger = 1;
3834 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3835 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3836 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3837 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3838}
3839
3840
3841IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3842{
3843 pFpuRes->r80Result.sj64.fSign = 0;
3844 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3845 pFpuRes->r80Result.sj64.fInteger = 1;
3846 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3847 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3848 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3849}
3850
3851
3852IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3853{
3854 pFpuRes->r80Result.sj64.fSign = 0;
3855 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3856 pFpuRes->r80Result.sj64.fInteger = 1;
3857 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3858 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3859 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3860 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3861}
3862
3863
3864IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3865{
3866 pFpuRes->r80Result.sj64.fSign = 0;
3867 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3868 pFpuRes->r80Result.sj64.fInteger = 1;
3869 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3870 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3871 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3872 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3873}
3874
3875
3876IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3877{
3878 pFpuRes->r80Result.sj64.fSign = 0;
3879 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3880 pFpuRes->r80Result.sj64.fInteger = 1;
3881 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3882 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3883 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3884 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3885}
3886
3887
3888IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3889{
3890 pFpuRes->r80Result.s.fSign = 0;
3891 pFpuRes->r80Result.s.uExponent = 0;
3892 pFpuRes->r80Result.s.uMantissa = 0;
3893 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3894}
3895
3896#define EMIT_FILD(a_cBits) \
3897IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3898 int ## a_cBits ## _t const *piVal)) \
3899{ \
3900 int ## a_cBits ## _t iVal = *piVal; \
3901 if (iVal == 0) \
3902 { \
3903 pFpuRes->r80Result.s.fSign = 0; \
3904 pFpuRes->r80Result.s.uExponent = 0; \
3905 pFpuRes->r80Result.s.uMantissa = 0; \
3906 } \
3907 else \
3908 { \
3909 if (iVal > 0) \
3910 pFpuRes->r80Result.s.fSign = 0; \
3911 else \
3912 { \
3913 pFpuRes->r80Result.s.fSign = 1; \
3914 iVal = -iVal; \
3915 } \
3916 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3917 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3918 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3919 } \
3920 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3921}
3922EMIT_FILD(16)
3923EMIT_FILD(32)
3924EMIT_FILD(64)
3925
3926
3927IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3928{
3929 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3930 if ( pd80Val->s.abPairs[0] == 0
3931 && pd80Val->s.abPairs[1] == 0
3932 && pd80Val->s.abPairs[2] == 0
3933 && pd80Val->s.abPairs[3] == 0
3934 && pd80Val->s.abPairs[4] == 0
3935 && pd80Val->s.abPairs[5] == 0
3936 && pd80Val->s.abPairs[6] == 0
3937 && pd80Val->s.abPairs[7] == 0
3938 && pd80Val->s.abPairs[8] == 0)
3939 {
3940 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3941 pFpuRes->r80Result.s.uExponent = 0;
3942 pFpuRes->r80Result.s.uMantissa = 0;
3943 }
3944 else
3945 {
3946 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3947
3948 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3949 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3950 cPairs--;
3951
3952 uint64_t uVal = 0;
3953 uint64_t uFactor = 1;
3954 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3955 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3956 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3957
3958 unsigned const cBits = ASMBitLastSetU64(uVal);
3959 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3960 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3961 }
3962}
3963
3964
3965/*********************************************************************************************************************************
3966* x87 FPU Stores *
3967*********************************************************************************************************************************/
3968
3969/**
3970 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
3971 *
3972 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
3973 *
3974 * @returns Updated FPU status word value.
3975 * @param fSignIn Incoming sign indicator.
3976 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
3977 * @param iExponentIn Unbiased exponent.
3978 * @param fFcw The FPU control word.
3979 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
3980 * @param pr32Dst Where to return the output value, if one should be
3981 * returned.
3982 *
3983 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
3984 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
3985 */
3986static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
3987 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
3988{
3989 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
3990 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3991 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
3992 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
3993 ? fRoundingOffMask
3994 : 0;
3995 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
3996
3997 /*
3998 * Deal with potential overflows/underflows first, optimizing for none.
3999 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4000 */
4001 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4002 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4003 { /* likely? */ }
4004 /*
4005 * Underflow if the exponent zero or negative. This is attempted mapped
4006 * to a subnormal number when possible, with some additional trickery ofc.
4007 */
4008 else if (iExponentOut <= 0)
4009 {
4010 bool const fIsTiny = iExponentOut < 0
4011 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4012 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4013 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4014 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4015
4016 if (iExponentOut <= 0)
4017 {
4018 uMantissaIn = iExponentOut <= -63
4019 ? uMantissaIn != 0
4020 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4021 fRoundedOff = uMantissaIn & fRoundingOffMask;
4022 if (fRoundedOff && fIsTiny)
4023 fFsw |= X86_FSW_UE;
4024 iExponentOut = 0;
4025 }
4026 }
4027 /*
4028 * Overflow if at or above max exponent value or if we will reach max
4029 * when rounding. Will return +/-zero or +/-max value depending on
4030 * whether we're rounding or not.
4031 */
4032 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4033 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4034 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4035 {
4036 fFsw |= X86_FSW_OE;
4037 if (!(fFcw & X86_FCW_OM))
4038 return fFsw | X86_FSW_ES | X86_FSW_B;
4039 fFsw |= X86_FSW_PE;
4040 if (uRoundingAdd)
4041 fFsw |= X86_FSW_C1;
4042 if (!(fFcw & X86_FCW_PM))
4043 fFsw |= X86_FSW_ES | X86_FSW_B;
4044
4045 pr32Dst->s.fSign = fSignIn;
4046 if (uRoundingAdd)
4047 { /* Zero */
4048 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4049 pr32Dst->s.uFraction = 0;
4050 }
4051 else
4052 { /* Max */
4053 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4054 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4055 }
4056 return fFsw;
4057 }
4058
4059 /*
4060 * Normal or subnormal number.
4061 */
4062 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4063 uint64_t uMantissaOut = uMantissaIn;
4064 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4065 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4066 || fRoundedOff != uRoundingAdd)
4067 {
4068 uMantissaOut = uMantissaIn + uRoundingAdd;
4069 if (uMantissaOut >= uMantissaIn)
4070 { /* likely */ }
4071 else
4072 {
4073 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4074 iExponentOut++;
4075 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4076 fFsw |= X86_FSW_C1;
4077 }
4078 }
4079 else
4080 uMantissaOut = uMantissaIn;
4081
4082 /* Truncate the mantissa and set the return value. */
4083 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4084
4085 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4086 pr32Dst->s.uExponent = iExponentOut;
4087 pr32Dst->s.fSign = fSignIn;
4088
4089 /* Set status flags realted to rounding. */
4090 if (fRoundedOff)
4091 {
4092 fFsw |= X86_FSW_PE;
4093 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4094 fFsw |= X86_FSW_C1;
4095 if (!(fFcw & X86_FCW_PM))
4096 fFsw |= X86_FSW_ES | X86_FSW_B;
4097 }
4098
4099 return fFsw;
4100}
4101
4102
4103/**
4104 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4105 */
4106IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4107 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4108{
4109 uint16_t const fFcw = pFpuState->FCW;
4110 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4111 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4112 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4113 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4114 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4115 {
4116 pr32Dst->s.fSign = pr80Src->s.fSign;
4117 pr32Dst->s.uExponent = 0;
4118 pr32Dst->s.uFraction = 0;
4119 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4120 }
4121 else if (RTFLOAT80U_IS_INF(pr80Src))
4122 {
4123 pr32Dst->s.fSign = pr80Src->s.fSign;
4124 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4125 pr32Dst->s.uFraction = 0;
4126 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4127 }
4128 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4129 {
4130 /* Mapped to +/-QNaN */
4131 pr32Dst->s.fSign = pr80Src->s.fSign;
4132 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4133 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4134 }
4135 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4136 {
4137 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4138 if (fFcw & X86_FCW_IM)
4139 {
4140 pr32Dst->s.fSign = 1;
4141 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4142 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4143 fFsw |= X86_FSW_IE;
4144 }
4145 else
4146 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4147 }
4148 else if (RTFLOAT80U_IS_NAN(pr80Src))
4149 {
4150 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4151 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4152 {
4153 pr32Dst->s.fSign = pr80Src->s.fSign;
4154 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4155 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4156 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4157 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4158 fFsw |= X86_FSW_IE;
4159 }
4160 else
4161 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4162 }
4163 else
4164 {
4165 /* Denormal values causes both an underflow and precision exception. */
4166 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4167 if (fFcw & X86_FCW_UM)
4168 {
4169 pr32Dst->s.fSign = pr80Src->s.fSign;
4170 pr32Dst->s.uExponent = 0;
4171 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4172 {
4173 pr32Dst->s.uFraction = 1;
4174 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4175 if (!(fFcw & X86_FCW_PM))
4176 fFsw |= X86_FSW_ES | X86_FSW_B;
4177 }
4178 else
4179 {
4180 pr32Dst->s.uFraction = 0;
4181 fFsw |= X86_FSW_UE | X86_FSW_PE;
4182 if (!(fFcw & X86_FCW_PM))
4183 fFsw |= X86_FSW_ES | X86_FSW_B;
4184 }
4185 }
4186 else
4187 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4188 }
4189 *pu16FSW = fFsw;
4190}
4191
4192
4193/**
4194 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4195 *
4196 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4197 *
4198 * @returns Updated FPU status word value.
4199 * @param fSignIn Incoming sign indicator.
4200 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4201 * @param iExponentIn Unbiased exponent.
4202 * @param fFcw The FPU control word.
4203 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4204 * @param pr64Dst Where to return the output value, if one should be
4205 * returned.
4206 *
4207 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4208 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4209 */
4210static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4211 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4212{
4213 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4214 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4215 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4216 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4217 ? fRoundingOffMask
4218 : 0;
4219 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4220
4221 /*
4222 * Deal with potential overflows/underflows first, optimizing for none.
4223 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4224 */
4225 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4226 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4227 { /* likely? */ }
4228 /*
4229 * Underflow if the exponent zero or negative. This is attempted mapped
4230 * to a subnormal number when possible, with some additional trickery ofc.
4231 */
4232 else if (iExponentOut <= 0)
4233 {
4234 bool const fIsTiny = iExponentOut < 0
4235 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4236 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4237 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4238 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4239
4240 if (iExponentOut <= 0)
4241 {
4242 uMantissaIn = iExponentOut <= -63
4243 ? uMantissaIn != 0
4244 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4245 fRoundedOff = uMantissaIn & fRoundingOffMask;
4246 if (fRoundedOff && fIsTiny)
4247 fFsw |= X86_FSW_UE;
4248 iExponentOut = 0;
4249 }
4250 }
4251 /*
4252 * Overflow if at or above max exponent value or if we will reach max
4253 * when rounding. Will return +/-zero or +/-max value depending on
4254 * whether we're rounding or not.
4255 */
4256 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4257 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4258 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4259 {
4260 fFsw |= X86_FSW_OE;
4261 if (!(fFcw & X86_FCW_OM))
4262 return fFsw | X86_FSW_ES | X86_FSW_B;
4263 fFsw |= X86_FSW_PE;
4264 if (uRoundingAdd)
4265 fFsw |= X86_FSW_C1;
4266 if (!(fFcw & X86_FCW_PM))
4267 fFsw |= X86_FSW_ES | X86_FSW_B;
4268
4269 pr64Dst->s64.fSign = fSignIn;
4270 if (uRoundingAdd)
4271 { /* Zero */
4272 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4273 pr64Dst->s64.uFraction = 0;
4274 }
4275 else
4276 { /* Max */
4277 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4278 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4279 }
4280 return fFsw;
4281 }
4282
4283 /*
4284 * Normal or subnormal number.
4285 */
4286 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4287 uint64_t uMantissaOut = uMantissaIn;
4288 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4289 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4290 || fRoundedOff != uRoundingAdd)
4291 {
4292 uMantissaOut = uMantissaIn + uRoundingAdd;
4293 if (uMantissaOut >= uMantissaIn)
4294 { /* likely */ }
4295 else
4296 {
4297 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4298 iExponentOut++;
4299 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4300 fFsw |= X86_FSW_C1;
4301 }
4302 }
4303 else
4304 uMantissaOut = uMantissaIn;
4305
4306 /* Truncate the mantissa and set the return value. */
4307 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4308
4309 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4310 pr64Dst->s64.uExponent = iExponentOut;
4311 pr64Dst->s64.fSign = fSignIn;
4312
4313 /* Set status flags realted to rounding. */
4314 if (fRoundedOff)
4315 {
4316 fFsw |= X86_FSW_PE;
4317 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4318 fFsw |= X86_FSW_C1;
4319 if (!(fFcw & X86_FCW_PM))
4320 fFsw |= X86_FSW_ES | X86_FSW_B;
4321 }
4322
4323 return fFsw;
4324}
4325
4326
4327/**
4328 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4329 */
4330IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4331 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4332{
4333 uint16_t const fFcw = pFpuState->FCW;
4334 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4335 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4336 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4337 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4338 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4339 {
4340 pr64Dst->s64.fSign = pr80Src->s.fSign;
4341 pr64Dst->s64.uExponent = 0;
4342 pr64Dst->s64.uFraction = 0;
4343 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4344 }
4345 else if (RTFLOAT80U_IS_INF(pr80Src))
4346 {
4347 pr64Dst->s64.fSign = pr80Src->s.fSign;
4348 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4349 pr64Dst->s64.uFraction = 0;
4350 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4351 }
4352 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4353 {
4354 /* Mapped to +/-QNaN */
4355 pr64Dst->s64.fSign = pr80Src->s.fSign;
4356 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4357 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4358 }
4359 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4360 {
4361 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4362 if (fFcw & X86_FCW_IM)
4363 {
4364 pr64Dst->s64.fSign = 1;
4365 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4366 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4367 fFsw |= X86_FSW_IE;
4368 }
4369 else
4370 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4371 }
4372 else if (RTFLOAT80U_IS_NAN(pr80Src))
4373 {
4374 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4375 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4376 {
4377 pr64Dst->s64.fSign = pr80Src->s.fSign;
4378 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4379 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4380 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4381 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4382 fFsw |= X86_FSW_IE;
4383 }
4384 else
4385 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4386 }
4387 else
4388 {
4389 /* Denormal values causes both an underflow and precision exception. */
4390 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4391 if (fFcw & X86_FCW_UM)
4392 {
4393 pr64Dst->s64.fSign = pr80Src->s.fSign;
4394 pr64Dst->s64.uExponent = 0;
4395 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4396 {
4397 pr64Dst->s64.uFraction = 1;
4398 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4399 if (!(fFcw & X86_FCW_PM))
4400 fFsw |= X86_FSW_ES | X86_FSW_B;
4401 }
4402 else
4403 {
4404 pr64Dst->s64.uFraction = 0;
4405 fFsw |= X86_FSW_UE | X86_FSW_PE;
4406 if (!(fFcw & X86_FCW_PM))
4407 fFsw |= X86_FSW_ES | X86_FSW_B;
4408 }
4409 }
4410 else
4411 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4412 }
4413 *pu16FSW = fFsw;
4414}
4415
4416
4417IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4418 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4419{
4420 /*
4421 * FPU status word:
4422 * - TOP is irrelevant, but we must match x86 assembly version (0).
4423 * - C1 is always cleared as we don't have any stack overflows.
4424 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4425 */
4426 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4427 *pr80Dst = *pr80Src;
4428}
4429
4430
4431/*
4432 *
4433 * Mantissa:
4434 * 63 56 48 40 32 24 16 8 0
4435 * v v v v v v v v v
4436 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4437 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4438 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4439 *
4440 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4441 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4442 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4443 * where we'll drop off all but bit 63.
4444 */
4445#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4446IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4447 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4448{ \
4449 uint16_t const fFcw = pFpuState->FCW; \
4450 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4451 bool const fSignIn = pr80Val->s.fSign; \
4452 \
4453 /* \
4454 * Deal with normal numbers first. \
4455 */ \
4456 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4457 { \
4458 uint64_t uMantissa = pr80Val->s.uMantissa; \
4459 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4460 \
4461 if ((uint32_t)iExponent <= a_cBits - 2) \
4462 { \
4463 unsigned const cShiftOff = 63 - iExponent; \
4464 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4465 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4466 ? RT_BIT_64(cShiftOff - 1) \
4467 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4468 ? fRoundingOffMask \
4469 : 0; \
4470 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4471 \
4472 uMantissa >>= cShiftOff; \
4473 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4474 uMantissa += uRounding; \
4475 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4476 { \
4477 if (fRoundedOff) \
4478 { \
4479 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4480 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4481 else if (uRounding) \
4482 fFsw |= X86_FSW_C1; \
4483 fFsw |= X86_FSW_PE; \
4484 if (!(fFcw & X86_FCW_PM)) \
4485 fFsw |= X86_FSW_ES | X86_FSW_B; \
4486 } \
4487 \
4488 if (!fSignIn) \
4489 *piDst = (a_iType)uMantissa; \
4490 else \
4491 *piDst = -(a_iType)uMantissa; \
4492 } \
4493 else \
4494 { \
4495 /* overflowed after rounding. */ \
4496 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4497 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4498 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4499 \
4500 /* Special case for the integer minimum value. */ \
4501 if (fSignIn) \
4502 { \
4503 *piDst = a_iTypeMin; \
4504 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4505 if (!(fFcw & X86_FCW_PM)) \
4506 fFsw |= X86_FSW_ES | X86_FSW_B; \
4507 } \
4508 else \
4509 { \
4510 fFsw |= X86_FSW_IE; \
4511 if (fFcw & X86_FCW_IM) \
4512 *piDst = a_iTypeMin; \
4513 else \
4514 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4515 } \
4516 } \
4517 } \
4518 /* \
4519 * Tiny sub-zero numbers. \
4520 */ \
4521 else if (iExponent < 0) \
4522 { \
4523 if (!fSignIn) \
4524 { \
4525 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4526 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4527 { \
4528 *piDst = 1; \
4529 fFsw |= X86_FSW_C1; \
4530 } \
4531 else \
4532 *piDst = 0; \
4533 } \
4534 else \
4535 { \
4536 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4537 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4538 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4539 *piDst = 0; \
4540 else \
4541 { \
4542 *piDst = -1; \
4543 fFsw |= X86_FSW_C1; \
4544 } \
4545 } \
4546 fFsw |= X86_FSW_PE; \
4547 if (!(fFcw & X86_FCW_PM)) \
4548 fFsw |= X86_FSW_ES | X86_FSW_B; \
4549 } \
4550 /* \
4551 * Special MIN case. \
4552 */ \
4553 else if ( fSignIn && iExponent == a_cBits - 1 \
4554 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4555 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4556 : uMantissa == RT_BIT_64(63))) \
4557 { \
4558 *piDst = a_iTypeMin; \
4559 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4560 { \
4561 fFsw |= X86_FSW_PE; \
4562 if (!(fFcw & X86_FCW_PM)) \
4563 fFsw |= X86_FSW_ES | X86_FSW_B; \
4564 } \
4565 } \
4566 /* \
4567 * Too large/small number outside the target integer range. \
4568 */ \
4569 else \
4570 { \
4571 fFsw |= X86_FSW_IE; \
4572 if (fFcw & X86_FCW_IM) \
4573 *piDst = a_iTypeIndefinite; \
4574 else \
4575 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4576 } \
4577 } \
4578 /* \
4579 * Map both +0 and -0 to integer zero (signless/+). \
4580 */ \
4581 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4582 *piDst = 0; \
4583 /* \
4584 * Denormals are just really tiny sub-zero numbers that are either rounded \
4585 * to zero, 1 or -1 depending on sign and rounding control. \
4586 */ \
4587 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4588 { \
4589 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4590 *piDst = 0; \
4591 else \
4592 { \
4593 *piDst = fSignIn ? -1 : 1; \
4594 fFsw |= X86_FSW_C1; \
4595 } \
4596 fFsw |= X86_FSW_PE; \
4597 if (!(fFcw & X86_FCW_PM)) \
4598 fFsw |= X86_FSW_ES | X86_FSW_B; \
4599 } \
4600 /* \
4601 * All other special values are considered invalid arguments and result \
4602 * in an IE exception and indefinite value if masked. \
4603 */ \
4604 else \
4605 { \
4606 fFsw |= X86_FSW_IE; \
4607 if (fFcw & X86_FCW_IM) \
4608 *piDst = a_iTypeIndefinite; \
4609 else \
4610 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4611 } \
4612 *pu16FSW = fFsw; \
4613}
4614EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4615EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4616EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4617
4618#endif /*IEM_WITHOUT_ASSEMBLY */
4619
4620
4621/*
4622 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4623 *
4624 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4625 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4626 * thus the @a a_cBitsIn.
4627 */
4628#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4629IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4630 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4631{ \
4632 uint16_t const fFcw = pFpuState->FCW; \
4633 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4634 bool const fSignIn = pr80Val->s.fSign; \
4635 \
4636 /* \
4637 * Deal with normal numbers first. \
4638 */ \
4639 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4640 { \
4641 uint64_t uMantissa = pr80Val->s.uMantissa; \
4642 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4643 \
4644 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4645 { \
4646 unsigned const cShiftOff = 63 - iExponent; \
4647 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4648 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4649 uMantissa >>= cShiftOff; \
4650 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4651 if (!fSignIn) \
4652 *piDst = (a_iType)uMantissa; \
4653 else \
4654 *piDst = -(a_iType)uMantissa; \
4655 \
4656 if (fRoundedOff) \
4657 { \
4658 fFsw |= X86_FSW_PE; \
4659 if (!(fFcw & X86_FCW_PM)) \
4660 fFsw |= X86_FSW_ES | X86_FSW_B; \
4661 } \
4662 } \
4663 /* \
4664 * Tiny sub-zero numbers. \
4665 */ \
4666 else if (iExponent < 0) \
4667 { \
4668 *piDst = 0; \
4669 fFsw |= X86_FSW_PE; \
4670 if (!(fFcw & X86_FCW_PM)) \
4671 fFsw |= X86_FSW_ES | X86_FSW_B; \
4672 } \
4673 /* \
4674 * Special MIN case. \
4675 */ \
4676 else if ( fSignIn && iExponent == a_cBits - 1 \
4677 && (a_cBits < 64 \
4678 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4679 : uMantissa == RT_BIT_64(63)) ) \
4680 { \
4681 *piDst = a_iTypeMin; \
4682 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4683 { \
4684 fFsw |= X86_FSW_PE; \
4685 if (!(fFcw & X86_FCW_PM)) \
4686 fFsw |= X86_FSW_ES | X86_FSW_B; \
4687 } \
4688 } \
4689 /* \
4690 * Figure this weirdness. \
4691 */ \
4692 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4693 { \
4694 *piDst = 0; \
4695 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4696 { \
4697 fFsw |= X86_FSW_PE; \
4698 if (!(fFcw & X86_FCW_PM)) \
4699 fFsw |= X86_FSW_ES | X86_FSW_B; \
4700 } \
4701 } \
4702 /* \
4703 * Too large/small number outside the target integer range. \
4704 */ \
4705 else \
4706 { \
4707 fFsw |= X86_FSW_IE; \
4708 if (fFcw & X86_FCW_IM) \
4709 *piDst = a_iTypeIndefinite; \
4710 else \
4711 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4712 } \
4713 } \
4714 /* \
4715 * Map both +0 and -0 to integer zero (signless/+). \
4716 */ \
4717 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4718 *piDst = 0; \
4719 /* \
4720 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4721 */ \
4722 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4723 { \
4724 *piDst = 0; \
4725 fFsw |= X86_FSW_PE; \
4726 if (!(fFcw & X86_FCW_PM)) \
4727 fFsw |= X86_FSW_ES | X86_FSW_B; \
4728 } \
4729 /* \
4730 * All other special values are considered invalid arguments and result \
4731 * in an IE exception and indefinite value if masked. \
4732 */ \
4733 else \
4734 { \
4735 fFsw |= X86_FSW_IE; \
4736 if (fFcw & X86_FCW_IM) \
4737 *piDst = a_iTypeIndefinite; \
4738 else \
4739 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4740 } \
4741 *pu16FSW = fFsw; \
4742}
4743#if defined(IEM_WITHOUT_ASSEMBLY)
4744EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4745EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4746EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4747#endif
4748EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4749EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4750
4751
4752#if defined(IEM_WITHOUT_ASSEMBLY)
4753
4754IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4755 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4756{
4757 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4758 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4759 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4760 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4761 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4762
4763 uint16_t const fFcw = pFpuState->FCW;
4764 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4765 bool const fSignIn = pr80Src->s.fSign;
4766
4767 /*
4768 * Deal with normal numbers first.
4769 */
4770 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4771 {
4772 uint64_t uMantissa = pr80Src->s.uMantissa;
4773 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
4774 if ( (uint32_t)iExponent <= 58
4775 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4776 {
4777 unsigned const cShiftOff = 63 - iExponent;
4778 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4779 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4780 ? RT_BIT_64(cShiftOff - 1)
4781 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4782 ? fRoundingOffMask
4783 : 0;
4784 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4785
4786 uMantissa >>= cShiftOff;
4787 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4788 uMantissa += uRounding;
4789 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4790 {
4791 if (fRoundedOff)
4792 {
4793 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4794 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4795 else if (uRounding)
4796 fFsw |= X86_FSW_C1;
4797 fFsw |= X86_FSW_PE;
4798 if (!(fFcw & X86_FCW_PM))
4799 fFsw |= X86_FSW_ES | X86_FSW_B;
4800 }
4801
4802 pd80Dst->s.fSign = fSignIn;
4803 pd80Dst->s.uPad = 0;
4804 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4805 {
4806 unsigned const uDigits = uMantissa % 100;
4807 uMantissa /= 100;
4808 uint8_t const bLo = uDigits % 10;
4809 uint8_t const bHi = uDigits / 10;
4810 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4811 }
4812 }
4813 else
4814 {
4815 /* overflowed after rounding. */
4816 fFsw |= X86_FSW_IE;
4817 if (fFcw & X86_FCW_IM)
4818 *pd80Dst = s_d80Indefinite;
4819 else
4820 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4821 }
4822 }
4823 /*
4824 * Tiny sub-zero numbers.
4825 */
4826 else if (iExponent < 0)
4827 {
4828 if (!fSignIn)
4829 {
4830 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4831 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4832 {
4833 *pd80Dst = s_ad80One[fSignIn];
4834 fFsw |= X86_FSW_C1;
4835 }
4836 else
4837 *pd80Dst = s_ad80Zeros[fSignIn];
4838 }
4839 else
4840 {
4841 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4842 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4843 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4844 *pd80Dst = s_ad80Zeros[fSignIn];
4845 else
4846 {
4847 *pd80Dst = s_ad80One[fSignIn];
4848 fFsw |= X86_FSW_C1;
4849 }
4850 }
4851 fFsw |= X86_FSW_PE;
4852 if (!(fFcw & X86_FCW_PM))
4853 fFsw |= X86_FSW_ES | X86_FSW_B;
4854 }
4855 /*
4856 * Too large/small number outside the target integer range.
4857 */
4858 else
4859 {
4860 fFsw |= X86_FSW_IE;
4861 if (fFcw & X86_FCW_IM)
4862 *pd80Dst = s_d80Indefinite;
4863 else
4864 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4865 }
4866 }
4867 /*
4868 * Map both +0 and -0 to integer zero (signless/+).
4869 */
4870 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4871 *pd80Dst = s_ad80Zeros[fSignIn];
4872 /*
4873 * Denormals are just really tiny sub-zero numbers that are either rounded
4874 * to zero, 1 or -1 depending on sign and rounding control.
4875 */
4876 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4877 {
4878 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4879 *pd80Dst = s_ad80Zeros[fSignIn];
4880 else
4881 {
4882 *pd80Dst = s_ad80One[fSignIn];
4883 fFsw |= X86_FSW_C1;
4884 }
4885 fFsw |= X86_FSW_PE;
4886 if (!(fFcw & X86_FCW_PM))
4887 fFsw |= X86_FSW_ES | X86_FSW_B;
4888 }
4889 /*
4890 * All other special values are considered invalid arguments and result
4891 * in an IE exception and indefinite value if masked.
4892 */
4893 else
4894 {
4895 fFsw |= X86_FSW_IE;
4896 if (fFcw & X86_FCW_IM)
4897 *pd80Dst = s_d80Indefinite;
4898 else
4899 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4900 }
4901 *pu16FSW = fFsw;
4902}
4903
4904
4905/*********************************************************************************************************************************
4906* FPU Helpers *
4907*********************************************************************************************************************************/
4908AssertCompileSize(RTFLOAT128U, 16);
4909AssertCompileSize(RTFLOAT80U, 10);
4910AssertCompileSize(RTFLOAT64U, 8);
4911AssertCompileSize(RTFLOAT32U, 4);
4912
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro declares a RTFLOAT80U variable named @a a_r80ValNormalized.  If
 * @a a_pr80Val points to a pseudo-denormal, the value is copied into that
 * variable with uExponent set to 1 and @a a_pr80Val is redirected to the copy;
 * otherwise @a a_pr80Val is left alone.
 *
 * @note This must be applied before calling SoftFloat with a value that could
 *       be a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
 *       correctly.
 * @note @a a_pr80Val must be an assignable pointer variable, and the macro
 *       invocation must be terminated with a semicolon.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { \
        a_r80ValNormalized             = *(a_pr80Val); \
        a_r80ValNormalized.s.uExponent = 1; \
        a_pr80Val                      = &a_r80ValNormalized; \
    } else do {} while (0)
4936
4937#ifdef IEM_WITH_FLOAT128_FOR_FPU
4938
4939DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4940{
4941 int fNew;
4942 switch (fFcw & X86_FCW_RC_MASK)
4943 {
4944 default:
4945 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4946 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4947 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4948 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4949 }
4950 int fOld = fegetround();
4951 fesetround(fNew);
4952 return fOld;
4953}
4954
4955
4956DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4957{
4958 fesetround(fOld);
4959}
4960
4961DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4962{
4963 RT_NOREF(fFcw);
4964 RTFLOAT128U Tmp;
4965 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4966 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4967 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4968 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4969 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4970 {
4971 Assert(Tmp.s.uExponent == 0);
4972 Tmp.s2.uSignAndExponent++;
4973 }
4974 return *(_Float128 *)&Tmp;
4975}
4976
4977
4978DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
4979{
4980 RT_NOREF(fFcw);
4981 RTFLOAT128U Tmp;
4982 *(_Float128 *)&Tmp = rd128ValSrc;
4983 ASMCompilerBarrier();
4984 if (RTFLOAT128U_IS_NORMAL(&Tmp))
4985 {
4986 pr80Dst->s.fSign = Tmp.s64.fSign;
4987 pr80Dst->s.uExponent = Tmp.s64.uExponent;
4988 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
4989 | Tmp.s64.uFractionLo >> (64 - 15);
4990
4991 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4992 unsigned const cShiftOff = 64 - 15;
4993 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4994 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
4995 if (uRoundedOff)
4996 {
4997 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4998 ? RT_BIT_64(cShiftOff - 1)
4999 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5000 ? fRoundingOffMask
5001 : 0;
5002 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5003 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5004 || uRoundedOff != uRoundingAdd)
5005 {
5006 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5007 {
5008 uFraction += 1;
5009 if (!(uFraction & RT_BIT_64(63)))
5010 { /* likely */ }
5011 else
5012 {
5013 uFraction >>= 1;
5014 pr80Dst->s.uExponent++;
5015 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5016 return fFsw;
5017 }
5018 fFsw |= X86_FSW_C1;
5019 }
5020 }
5021 fFsw |= X86_FSW_PE;
5022 if (!(fFcw & X86_FCW_PM))
5023 fFsw |= X86_FSW_ES | X86_FSW_B;
5024 }
5025 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5026 }
5027 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5028 {
5029 pr80Dst->s.fSign = Tmp.s64.fSign;
5030 pr80Dst->s.uExponent = 0;
5031 pr80Dst->s.uMantissa = 0;
5032 }
5033 else if (RTFLOAT128U_IS_INF(&Tmp))
5034 {
5035 pr80Dst->s.fSign = Tmp.s64.fSign;
5036 pr80Dst->s.uExponent = 0;
5037 pr80Dst->s.uMantissa = 0;
5038 }
5039 return fFsw;
5040}
5041
5042
5043#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5044
/**
 * Initializer for the SoftFloat state structure.
 *
 * Produces five values from @a a_fFcw, in this order: the tininess detection
 * mode (always after rounding), the rounding mode matching the FCW rounding
 * control, zero, the FCW exception mask bits, and the rounding precision
 * (64 for PC_53, 32 for PC_24, otherwise 80).
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
        (uint8_t)(  ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? softfloat_round_near_even \
                  : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP      ? softfloat_round_max \
                  : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN    ? softfloat_round_min \
                  :                                                      softfloat_round_minMag), \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
        (uint8_t)(  ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? 64 \
                  : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? 32 \
                  :                                                 80) \
    }
5058
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * Merges the SoftFloat exception flags into @a a_fFsw, shifts the SoftFloat
 * C1 flag two bits up, and adds ES + B when any of the raised exceptions is
 * not masked in @a a_fFcw.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (   (a_fFsw) \
      | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
      | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
      | (  ((a_pSoftState)->exceptionFlags & ~(a_fFcw) & X86_FSW_XCPT_MASK) \
         ? X86_FSW_ES | X86_FSW_B : 0) )
5066
5067
5068DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5069{
5070 RT_NOREF(fFcw);
5071 Assert(cBits > 64);
5072# if 0 /* rounding does not seem to help */
5073 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5074 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5075 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5076 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5077 {
5078 uint64_t uOld = r128.v[0];
5079 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5080 if (r128.v[0] < uOld)
5081 r128.v[1] += 1;
5082 }
5083# else
5084 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5085# endif
5086 return r128;
5087}
5088
5089
5090DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5091{
5092 RT_NOREF(fFcw);
5093 Assert(cBits > 64);
5094# if 0 /* rounding does not seem to help, not even on constants */
5095 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5096 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5097 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5098 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5099 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5100 {
5101 uint64_t uOld = r128.v[0];
5102 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5103 if (r128.v[0] < uOld)
5104 r128.v[1] += 1;
5105 }
5106 return r128;
5107# else
5108 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5109 return r128;
5110# endif
5111}
5112
5113
# if 0 /* unused */
/** Repackages an IPRT 128-bit value as a SoftFloat one, both qwords copied as-is. */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128;
    r128.v[0] = pr128->au64[0];
    r128.v[1] = pr128->au64[1];
    return r128;
}
# endif
5121
5122
5123/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5124DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5125{
5126 extFloat80_t Tmp;
5127 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5128 Tmp.signif = pr80Val->s2.uMantissa;
5129 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5130 return extF80_to_f128(Tmp, &Ignored);
5131}
5132
5133
5134/**
5135 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5136 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5137 *
5138 * This is only a structure format conversion, nothing else.
5139 */
5140DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5141{
5142 extFloat80_t Tmp;
5143 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5144 Tmp.signif = pr80Val->s2.uMantissa;
5145 return Tmp;
5146}
5147
5148
5149/**
5150 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5151 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5152 *
5153 * This is only a structure format conversion, nothing else.
5154 */
5155DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5156{
5157 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5158 pr80Dst->s2.uMantissa = r80XSrc.signif;
5159 return pr80Dst;
5160}
5161
5162
5163DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5164{
5165 RT_NOREF(fFcw);
5166 RTFLOAT128U Tmp;
5167 *(float128_t *)&Tmp = r128Src;
5168 ASMCompilerBarrier();
5169
5170 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5171 {
5172 pr80Dst->s.fSign = Tmp.s64.fSign;
5173 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5174 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5175 | Tmp.s64.uFractionLo >> (64 - 15);
5176
5177 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5178 unsigned const cShiftOff = 64 - 15;
5179 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5180 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5181 if (uRoundedOff)
5182 {
5183 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5184 ? RT_BIT_64(cShiftOff - 1)
5185 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5186 ? fRoundingOffMask
5187 : 0;
5188 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5189 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5190 || uRoundedOff != uRoundingAdd)
5191 {
5192 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5193 {
5194 uFraction += 1;
5195 if (!(uFraction & RT_BIT_64(63)))
5196 { /* likely */ }
5197 else
5198 {
5199 uFraction >>= 1;
5200 pr80Dst->s.uExponent++;
5201 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5202 return fFsw;
5203 }
5204 fFsw |= X86_FSW_C1;
5205 }
5206 }
5207 fFsw |= X86_FSW_PE;
5208 if (!(fFcw & X86_FCW_PM))
5209 fFsw |= X86_FSW_ES | X86_FSW_B;
5210 }
5211
5212 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5213 }
5214 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5215 {
5216 pr80Dst->s.fSign = Tmp.s64.fSign;
5217 pr80Dst->s.uExponent = 0;
5218 pr80Dst->s.uMantissa = 0;
5219 }
5220 else if (RTFLOAT128U_IS_INF(&Tmp))
5221 {
5222 pr80Dst->s.fSign = Tmp.s64.fSign;
5223 pr80Dst->s.uExponent = 0;
5224 pr80Dst->s.uMantissa = 0;
5225 }
5226 return fFsw;
5227}
5228
5229
5230/**
5231 * Helper for transfering exception and C1 to FSW and setting the result value
5232 * accordingly.
5233 *
5234 * @returns Updated FSW.
5235 * @param pSoftState The SoftFloat state following the operation.
5236 * @param r80XResult The result of the SoftFloat operation.
5237 * @param pr80Result Where to store the result for IEM.
5238 * @param fFcw The FPU control word.
5239 * @param fFsw The FSW before the operation, with necessary bits
5240 * cleared and such.
5241 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5242 * raised.
5243 */
5244DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5245 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5246 PCRTFLOAT80U pr80XcptResult)
5247{
5248 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5249 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5250 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5251 fFsw |= X86_FSW_ES | X86_FSW_B;
5252
5253 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5254 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5255 else
5256 {
5257 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5258 *pr80Result = *pr80XcptResult;
5259 }
5260 return fFsw;
5261}
5262
5263
5264/**
5265 * Helper doing polynomial evaluation using Horner's method.
5266 *
5267 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5268 */
5269float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5270 unsigned cPrecision, softfloat_state_t *pSoftState)
5271{
5272 Assert(cHornerConsts > 1);
5273 size_t i = cHornerConsts - 1;
5274 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5275 while (i-- > 0)
5276 {
5277 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5278 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5279 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5280 }
5281 return r128Result;
5282}
5283
5284#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5285
5286
5287/**
5288 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5289 * mantissa, exponent and sign.
5290 *
5291 * @returns Updated FSW.
5292 * @param pr80Dst Where to return the composed value.
5293 * @param fSign The sign.
5294 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5295 * ignored and should be zero. This will probably be
5296 * modified during normalization and rounding.
5297 * @param iExponent Unbiased exponent.
5298 * @param fFcw The FPU control word.
5299 * @param fFsw The FPU status word.
5300 */
5301static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5302 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5303{
5304 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5305
5306 iExponent += RTFLOAT80U_EXP_BIAS;
5307
5308 /* Do normalization if necessary and possible. */
5309 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5310 {
5311 int cShift = 192 - RTUInt256BitCount(puMantissa);
5312 if (iExponent > cShift)
5313 iExponent -= cShift;
5314 else
5315 {
5316 if (fFcw & X86_FCW_UM)
5317 {
5318 if (iExponent > 0)
5319 cShift = --iExponent;
5320 else
5321 cShift = 0;
5322 }
5323 iExponent -= cShift;
5324 }
5325 RTUInt256AssignShiftLeft(puMantissa, cShift);
5326 }
5327
5328 /* Do rounding. */
5329 uint64_t uMantissa = puMantissa->QWords.qw2;
5330 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5331 {
5332 bool fAdd;
5333 switch (fFcw & X86_FCW_RC_MASK)
5334 {
5335 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5336 case X86_FCW_RC_NEAREST:
5337 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5338 {
5339 if ( (uMantissa & 1)
5340 || puMantissa->QWords.qw0 != 0
5341 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5342 {
5343 fAdd = true;
5344 break;
5345 }
5346 uMantissa &= ~(uint64_t)1;
5347 }
5348 fAdd = false;
5349 break;
5350 case X86_FCW_RC_ZERO:
5351 fAdd = false;
5352 break;
5353 case X86_FCW_RC_UP:
5354 fAdd = !fSign;
5355 break;
5356 case X86_FCW_RC_DOWN:
5357 fAdd = fSign;
5358 break;
5359 }
5360 if (fAdd)
5361 {
5362 uint64_t const uTmp = uMantissa;
5363 uMantissa = uTmp + 1;
5364 if (uMantissa < uTmp)
5365 {
5366 uMantissa >>= 1;
5367 uMantissa |= RT_BIT_64(63);
5368 iExponent++;
5369 }
5370 fFsw |= X86_FSW_C1;
5371 }
5372 fFsw |= X86_FSW_PE;
5373 if (!(fFcw & X86_FCW_PM))
5374 fFsw |= X86_FSW_ES | X86_FSW_B;
5375 }
5376
5377 /* Check for underflow (denormals). */
5378 if (iExponent <= 0)
5379 {
5380 if (fFcw & X86_FCW_UM)
5381 {
5382 if (uMantissa & RT_BIT_64(63))
5383 uMantissa >>= 1;
5384 iExponent = 0;
5385 }
5386 else
5387 {
5388 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5389 fFsw |= X86_FSW_ES | X86_FSW_B;
5390 }
5391 fFsw |= X86_FSW_UE;
5392 }
5393 /* Check for overflow */
5394 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5395 {
5396 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5397 }
5398
5399 /* Compose the result. */
5400 pr80Dst->s.uMantissa = uMantissa;
5401 pr80Dst->s.uExponent = iExponent;
5402 pr80Dst->s.fSign = fSign;
5403 return fFsw;
5404}
5405
5406
5407/**
5408 * See also iemAImpl_fld_r80_from_r32
5409 */
5410static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5411{
5412 uint16_t fFsw = 0;
5413 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5414 {
5415 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5416 pr80Dst->sj64.fInteger = 1;
5417 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5418 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5419 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5420 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5421 }
5422 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5423 {
5424 pr80Dst->s.fSign = pr32Val->s.fSign;
5425 pr80Dst->s.uExponent = 0;
5426 pr80Dst->s.uMantissa = 0;
5427 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5428 }
5429 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5430 {
5431 /* Subnormal -> normalized + X86_FSW_DE return. */
5432 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5433 pr80Dst->sj64.fInteger = 1;
5434 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5435 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5436 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5437 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5438 fFsw = X86_FSW_DE;
5439 }
5440 else if (RTFLOAT32U_IS_INF(pr32Val))
5441 {
5442 pr80Dst->s.fSign = pr32Val->s.fSign;
5443 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5444 pr80Dst->s.uMantissa = RT_BIT_64(63);
5445 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5446 }
5447 else
5448 {
5449 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5450 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5451 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5452 pr80Dst->sj64.fInteger = 1;
5453 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5454 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5455 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5456 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5457 }
5458 return fFsw;
5459}
5460
5461
5462/**
5463 * See also iemAImpl_fld_r80_from_r64
5464 */
5465static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5466{
5467 uint16_t fFsw = 0;
5468 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5469 {
5470 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5471 pr80Dst->sj64.fInteger = 1;
5472 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5473 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5474 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5475 }
5476 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5477 {
5478 pr80Dst->s.fSign = pr64Val->s.fSign;
5479 pr80Dst->s.uExponent = 0;
5480 pr80Dst->s.uMantissa = 0;
5481 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5482 }
5483 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5484 {
5485 /* Subnormal values gets normalized. */
5486 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5487 pr80Dst->sj64.fInteger = 1;
5488 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5489 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5490 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5491 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5492 fFsw = X86_FSW_DE;
5493 }
5494 else if (RTFLOAT64U_IS_INF(pr64Val))
5495 {
5496 pr80Dst->s.fSign = pr64Val->s.fSign;
5497 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5498 pr80Dst->s.uMantissa = RT_BIT_64(63);
5499 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5500 }
5501 else
5502 {
5503 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5504 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5505 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5506 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5507 pr80Dst->sj64.fInteger = 1;
5508 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5509 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5510 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5511 }
5512 return fFsw;
5513}
5514
5515
5516/**
5517 * See also EMIT_FILD.
5518 */
5519#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5520static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5521{ \
5522 if (iVal == 0) \
5523 { \
5524 pr80Dst->s.fSign = 0; \
5525 pr80Dst->s.uExponent = 0; \
5526 pr80Dst->s.uMantissa = 0; \
5527 } \
5528 else \
5529 { \
5530 if (iVal > 0) \
5531 pr80Dst->s.fSign = 0; \
5532 else \
5533 { \
5534 pr80Dst->s.fSign = 1; \
5535 iVal = -iVal; \
5536 } \
5537 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5538 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5539 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5540 } \
5541 return pr80Dst; \
5542}
5543EMIT_CONVERT_IXX_TO_R80(16)
5544EMIT_CONVERT_IXX_TO_R80(32)
5545//EMIT_CONVERT_IXX_TO_R80(64)
5546
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * The 64-bit operand is converted to 80-bit and then handed to the given
 * 80-bit by 80-bit implementation.  A subnormal 64-bit operand is reported as
 * \#DE, except when the first operand is a 387-invalid value or a NaN, or when
 * @a a_DenormalException evaluates to true.  If \#DE is to be reported but is
 * not masked in the FCW, the first operand is returned unchanged and the
 * operation isn't carried out.  TOP is set to 7 in the returned FSW.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The 80-bit by 80-bit implementation to call.
 * @param   a_DenormalException     Extra condition (may refer to pr80Val1) for
 *                                  suppressing \#DE on a subnormal operand.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        bool const fSuppressDe =    RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                                 || RTFLOAT80U_IS_NAN(pr80Val1) \
                                 || (a_DenormalException); \
        if (fSuppressDe) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5569
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * The 32-bit operand is converted to 80-bit and then handed to the given
 * 80-bit by 80-bit implementation.  A subnormal 32-bit operand is reported as
 * \#DE, except when the first operand is a 387-invalid value or a NaN, or when
 * @a a_DenormalException evaluates to true.  If \#DE is to be reported but is
 * not masked in the FCW, the first operand is returned unchanged and the
 * operation isn't carried out.  TOP is set to 7 in the returned FSW.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The 80-bit by 80-bit implementation to call.
 * @param   a_DenormalException     Extra condition (may refer to pr80Val1) for
 *                                  suppressing \#DE on a subnormal operand.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        bool const fSuppressDe =    RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                                 || RTFLOAT80U_IS_NAN(pr80Val1) \
                                 || (a_DenormalException); \
        if (fSuppressDe) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5592
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * Converts the 32-bit signed integer operand to 80-bit floating point, calls
 * the given 80-bit by 80-bit implementation and sets TOP to 7 in the
 * resulting FSW.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The 80-bit by 80-bit implementation to call.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuRes->FSW & ~X86_FSW_TOP_MASK); \
}
5601
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * Converts the 16-bit signed integer operand to 80-bit floating point, calls
 * the given 80-bit by 80-bit implementation and sets TOP to 7 in the
 * resulting FSW.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The 80-bit by 80-bit implementation to call.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuRes->FSW & ~X86_FSW_TOP_MASK); \
}
5610
5611
5612
5613/*********************************************************************************************************************************
*   x87 FPU Division Operations                                                                                                  *
5615*********************************************************************************************************************************/
5616
5617/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5618static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5619 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5620{
5621 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5622 {
5623 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5624 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5625 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5626 }
5627 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5628 { /* Div by zero. */
5629 if (fFcw & X86_FCW_ZM)
5630 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5631 else
5632 {
5633 *pr80Result = *pr80Val1Org;
5634 fFsw |= X86_FSW_ES | X86_FSW_B;
5635 }
5636 fFsw |= X86_FSW_ZE;
5637 }
5638 else
5639 { /* Invalid operand */
5640 if (fFcw & X86_FCW_IM)
5641 *pr80Result = g_r80Indefinite;
5642 else
5643 {
5644 *pr80Result = *pr80Val1Org;
5645 fFsw |= X86_FSW_ES | X86_FSW_B;
5646 }
5647 fFsw |= X86_FSW_IE;
5648 }
5649 return fFsw;
5650}
5651
5652
5653IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5654 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5655{
5656 uint16_t const fFcw = pFpuState->FCW;
5657 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5658
5659 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5660 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5661 {
5662 if (fFcw & X86_FCW_IM)
5663 pFpuRes->r80Result = g_r80Indefinite;
5664 else
5665 {
5666 pFpuRes->r80Result = *pr80Val1;
5667 fFsw |= X86_FSW_ES | X86_FSW_B;
5668 }
5669 fFsw |= X86_FSW_IE;
5670 }
5671 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5672 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5673 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5674 {
5675 if (fFcw & X86_FCW_DM)
5676 {
5677 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5678 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5679 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5680 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5681 }
5682 else
5683 {
5684 pFpuRes->r80Result = *pr80Val1;
5685 fFsw |= X86_FSW_ES | X86_FSW_B;
5686 }
5687 fFsw |= X86_FSW_DE;
5688 }
5689 /* SoftFloat can handle the rest: */
5690 else
5691 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5692
5693 pFpuRes->FSW = fFsw;
5694}
5695
5696
5697EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5698EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5699EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5700EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5701
5702
5703IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5704 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5705{
5706 uint16_t const fFcw = pFpuState->FCW;
5707 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5708
5709 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5710 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5711 {
5712 if (fFcw & X86_FCW_IM)
5713 pFpuRes->r80Result = g_r80Indefinite;
5714 else
5715 {
5716 pFpuRes->r80Result = *pr80Val1;
5717 fFsw |= X86_FSW_ES | X86_FSW_B;
5718 }
5719 fFsw |= X86_FSW_IE;
5720 }
5721 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5722 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5723 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5724 {
5725 if (fFcw & X86_FCW_DM)
5726 {
5727 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5728 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5729 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5730 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5731 }
5732 else
5733 {
5734 pFpuRes->r80Result = *pr80Val1;
5735 fFsw |= X86_FSW_ES | X86_FSW_B;
5736 }
5737 fFsw |= X86_FSW_DE;
5738 }
5739 /* SoftFloat can handle the rest: */
5740 else
5741 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5742
5743 pFpuRes->FSW = fFsw;
5744}
5745
5746
5747EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5748EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5749EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5750EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5751
5752
5753/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5754static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5755 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5756{
5757 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5758 {
5759 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5760 uint16_t fCxFlags = 0;
5761 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5762 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5763 &fCxFlags, &SoftState);
5764 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5765 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5766 if ( !(fFsw & X86_FSW_IE)
5767 && !RTFLOAT80U_IS_NAN(pr80Result)
5768 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5769 {
5770 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5771 fFsw |= fCxFlags & X86_FSW_C_MASK;
5772 }
5773 return fFsw;
5774 }
5775
5776 /* Invalid operand */
5777 if (fFcw & X86_FCW_IM)
5778 *pr80Result = g_r80Indefinite;
5779 else
5780 {
5781 *pr80Result = *pr80Val1Org;
5782 fFsw |= X86_FSW_ES | X86_FSW_B;
5783 }
5784 return fFsw | X86_FSW_IE;
5785}
5786
5787
5788static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5789 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5790{
5791 uint16_t const fFcw = pFpuState->FCW;
5792 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5793
5794 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5795 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5796 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5797 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5798 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5799 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5800 {
5801 if (fFcw & X86_FCW_IM)
5802 pFpuRes->r80Result = g_r80Indefinite;
5803 else
5804 {
5805 pFpuRes->r80Result = *pr80Val1;
5806 fFsw |= X86_FSW_ES | X86_FSW_B;
5807 }
5808 fFsw |= X86_FSW_IE;
5809 }
5810 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5811 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5812 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5813 {
5814 if (fFcw & X86_FCW_DM)
5815 {
5816 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5817 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5818 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5819 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5820 pr80Val1Org, fLegacyInstr);
5821 }
5822 else
5823 {
5824 pFpuRes->r80Result = *pr80Val1;
5825 fFsw |= X86_FSW_ES | X86_FSW_B;
5826 }
5827 fFsw |= X86_FSW_DE;
5828 }
5829 /* SoftFloat can handle the rest: */
5830 else
5831 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5832 pr80Val1, fLegacyInstr);
5833
5834 pFpuRes->FSW = fFsw;
5835}
5836
5837
5838IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5839 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5840{
5841 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5842}
5843
5844
5845IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5846 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5847{
5848 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5849}
5850
5851
5852/*********************************************************************************************************************************
5853* x87 FPU Multiplication Operations *
5854*********************************************************************************************************************************/
5855
5856/** Worker for iemAImpl_fmul_r80_by_r80. */
5857static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5858 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5859{
5860 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5861 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5862 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5863}
5864
5865
5866IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5867 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5868{
5869 uint16_t const fFcw = pFpuState->FCW;
5870 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5871
5872 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5873 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5874 {
5875 if (fFcw & X86_FCW_IM)
5876 pFpuRes->r80Result = g_r80Indefinite;
5877 else
5878 {
5879 pFpuRes->r80Result = *pr80Val1;
5880 fFsw |= X86_FSW_ES | X86_FSW_B;
5881 }
5882 fFsw |= X86_FSW_IE;
5883 }
5884 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5885 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5886 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5887 {
5888 if (fFcw & X86_FCW_DM)
5889 {
5890 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5891 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5892 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5893 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5894 }
5895 else
5896 {
5897 pFpuRes->r80Result = *pr80Val1;
5898 fFsw |= X86_FSW_ES | X86_FSW_B;
5899 }
5900 fFsw |= X86_FSW_DE;
5901 }
5902 /* SoftFloat can handle the rest: */
5903 else
5904 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5905
5906 pFpuRes->FSW = fFsw;
5907}
5908
5909
5910EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5911EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5912EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5913EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5914
5915
5916/*********************************************************************************************************************************
5917* x87 FPU Addition *
5918*********************************************************************************************************************************/
5919
5920/** Worker for iemAImpl_fadd_r80_by_r80. */
5921static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5922 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5923{
5924 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5925 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5926 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5927}
5928
5929
5930IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5931 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5932{
5933 uint16_t const fFcw = pFpuState->FCW;
5934 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5935
5936 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5937 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5938 {
5939 if (fFcw & X86_FCW_IM)
5940 pFpuRes->r80Result = g_r80Indefinite;
5941 else
5942 {
5943 pFpuRes->r80Result = *pr80Val1;
5944 fFsw |= X86_FSW_ES | X86_FSW_B;
5945 }
5946 fFsw |= X86_FSW_IE;
5947 }
5948 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5949 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5950 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5951 {
5952 if (fFcw & X86_FCW_DM)
5953 {
5954 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5955 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5956 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5957 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5958 }
5959 else
5960 {
5961 pFpuRes->r80Result = *pr80Val1;
5962 fFsw |= X86_FSW_ES | X86_FSW_B;
5963 }
5964 fFsw |= X86_FSW_DE;
5965 }
5966 /* SoftFloat can handle the rest: */
5967 else
5968 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5969
5970 pFpuRes->FSW = fFsw;
5971}
5972
5973
5974EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
5975EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
5976EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
5977EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
5978
5979
5980/*********************************************************************************************************************************
5981* x87 FPU Subtraction *
5982*********************************************************************************************************************************/
5983
5984/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
5985static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5986 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5987{
5988 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5989 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5990 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5991}
5992
5993
5994IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5995 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5996{
5997 uint16_t const fFcw = pFpuState->FCW;
5998 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5999
6000 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6001 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6002 {
6003 if (fFcw & X86_FCW_IM)
6004 pFpuRes->r80Result = g_r80Indefinite;
6005 else
6006 {
6007 pFpuRes->r80Result = *pr80Val1;
6008 fFsw |= X86_FSW_ES | X86_FSW_B;
6009 }
6010 fFsw |= X86_FSW_IE;
6011 }
6012 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6013 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6014 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6015 {
6016 if (fFcw & X86_FCW_DM)
6017 {
6018 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6019 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6020 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6021 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6022 }
6023 else
6024 {
6025 pFpuRes->r80Result = *pr80Val1;
6026 fFsw |= X86_FSW_ES | X86_FSW_B;
6027 }
6028 fFsw |= X86_FSW_DE;
6029 }
6030 /* SoftFloat can handle the rest: */
6031 else
6032 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6033
6034 pFpuRes->FSW = fFsw;
6035}
6036
6037
6038EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6039EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6040EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6041EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6042
6043
6044/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6045IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6046 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6047{
6048 uint16_t const fFcw = pFpuState->FCW;
6049 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6050
6051 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6052 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6053 {
6054 if (fFcw & X86_FCW_IM)
6055 pFpuRes->r80Result = g_r80Indefinite;
6056 else
6057 {
6058 pFpuRes->r80Result = *pr80Val1;
6059 fFsw |= X86_FSW_ES | X86_FSW_B;
6060 }
6061 fFsw |= X86_FSW_IE;
6062 }
6063 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6064 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6065 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6066 {
6067 if (fFcw & X86_FCW_DM)
6068 {
6069 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6070 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6071 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6072 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6073 }
6074 else
6075 {
6076 pFpuRes->r80Result = *pr80Val1;
6077 fFsw |= X86_FSW_ES | X86_FSW_B;
6078 }
6079 fFsw |= X86_FSW_DE;
6080 }
6081 /* SoftFloat can handle the rest: */
6082 else
6083 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6084
6085 pFpuRes->FSW = fFsw;
6086}
6087
6088
6089EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6090EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6091EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6092EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6093
6094
6095/*********************************************************************************************************************************
*   x87 FPU Trigonometric Operations                                                                                             *
6097*********************************************************************************************************************************/
6098
6099
6100IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6101 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6102{
6103 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6104 AssertReleaseFailed();
6105}
6106
6107#endif /* IEM_WITHOUT_ASSEMBLY */
6108
6109IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6110 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6111{
6112 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6113}
6114
6115IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6116 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6117{
6118 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6119}
6120
6121
6122#if defined(IEM_WITHOUT_ASSEMBLY)
6123IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6124{
6125 RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
6126 AssertReleaseFailed();
6127}
6128#endif /* IEM_WITHOUT_ASSEMBLY */
6129
6130IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6131{
6132 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6133}
6134
6135IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6136{
6137 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6138}
6139
6140
6141#ifdef IEM_WITHOUT_ASSEMBLY
6142IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6143{
6144 RT_NOREF(pFpuState, pFpuRes, pr80Val);
6145 AssertReleaseFailed();
6146}
6147#endif /* IEM_WITHOUT_ASSEMBLY */
6148
6149IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6150{
6151 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6152}
6153
6154IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6155{
6156 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6157}
6158
6159#ifdef IEM_WITHOUT_ASSEMBLY
6160IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6161{
6162 RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
6163 AssertReleaseFailed();
6164}
6165#endif /* IEM_WITHOUT_ASSEMBLY */
6166
6167IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6168{
6169 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6170}
6171
6172IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6173{
6174 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6175}
6176
6177
6178#ifdef IEM_WITHOUT_ASSEMBLY
6179IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6180{
6181 RT_NOREF(pFpuState, pFpuRes, pr80Val);
6182 AssertReleaseFailed();
6183}
6184#endif /* IEM_WITHOUT_ASSEMBLY */
6185
6186IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6187{
6188 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6189}
6190
6191IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6192{
6193 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6194}
6195
6196#ifdef IEM_WITHOUT_ASSEMBLY
6197
6198
6199/*********************************************************************************************************************************
6200* x87 FPU Compare and Testing Operations *
6201*********************************************************************************************************************************/
6202
6203IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6204{
6205 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6206
6207 if (RTFLOAT80U_IS_ZERO(pr80Val))
6208 fFsw |= X86_FSW_C3;
6209 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6210 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6211 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6212 {
6213 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6214 if (!(pFpuState->FCW & X86_FCW_DM))
6215 fFsw |= X86_FSW_ES | X86_FSW_B;
6216 }
6217 else
6218 {
6219 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6220 if (!(pFpuState->FCW & X86_FCW_IM))
6221 fFsw |= X86_FSW_ES | X86_FSW_B;
6222 }
6223
6224 *pu16Fsw = fFsw;
6225}
6226
6227
6228IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6229{
6230 RT_NOREF(pFpuState);
6231 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6232
6233 /* C1 = sign bit (always, even if empty Intel says). */
6234 if (pr80Val->s.fSign)
6235 fFsw |= X86_FSW_C1;
6236
6237 /* Classify the value in C0, C2, C3. */
6238 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6239 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6240 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6241 fFsw |= X86_FSW_C2;
6242 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6243 fFsw |= X86_FSW_C3;
6244 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6245 fFsw |= X86_FSW_C0;
6246 else if (RTFLOAT80U_IS_INF(pr80Val))
6247 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6248 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6249 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6250 /* whatever else: 0 */
6251
6252 *pu16Fsw = fFsw;
6253}
6254
6255
6256/**
6257 * Worker for fcom, fucom, and friends.
6258 */
6259static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6260 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6261{
6262 /*
6263 * Unpack the values.
6264 */
6265 bool const fSign1 = pr80Val1->s.fSign;
6266 int32_t iExponent1 = pr80Val1->s.uExponent;
6267 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6268
6269 bool const fSign2 = pr80Val2->s.fSign;
6270 int32_t iExponent2 = pr80Val2->s.uExponent;
6271 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6272
6273 /*
6274 * Check for invalid inputs.
6275 */
6276 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6277 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6278 {
6279 if (!(fFcw & X86_FCW_IM))
6280 fFsw |= X86_FSW_ES | X86_FSW_B;
6281 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6282 }
6283
6284 /*
6285 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6286 */
6287 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6288 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6289 {
6290 if ( fIeOnAllNaNs
6291 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6292 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6293 {
6294 fFsw |= X86_FSW_IE;
6295 if (!(fFcw & X86_FCW_IM))
6296 fFsw |= X86_FSW_ES | X86_FSW_B;
6297 }
6298 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6299 }
6300
6301 /*
6302 * Normalize the values.
6303 */
6304 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6305 {
6306 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6307 iExponent1 = 1;
6308 else
6309 {
6310 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6311 uMantissa1 <<= iExponent1;
6312 iExponent1 = 1 - iExponent1;
6313 }
6314 fFsw |= X86_FSW_DE;
6315 if (!(fFcw & X86_FCW_DM))
6316 fFsw |= X86_FSW_ES | X86_FSW_B;
6317 }
6318
6319 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6320 {
6321 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6322 iExponent2 = 1;
6323 else
6324 {
6325 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6326 uMantissa2 <<= iExponent2;
6327 iExponent2 = 1 - iExponent2;
6328 }
6329 fFsw |= X86_FSW_DE;
6330 if (!(fFcw & X86_FCW_DM))
6331 fFsw |= X86_FSW_ES | X86_FSW_B;
6332 }
6333
6334 /*
6335 * Test if equal (val1 == val2):
6336 */
6337 if ( uMantissa1 == uMantissa2
6338 && iExponent1 == iExponent2
6339 && ( fSign1 == fSign2
6340 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6341 fFsw |= X86_FSW_C3;
6342 /*
6343 * Test if less than (val1 < val2):
6344 */
6345 else if (fSign1 && !fSign2)
6346 fFsw |= X86_FSW_C0;
6347 else if (fSign1 == fSign2)
6348 {
6349 /* Zeros are problematic, however at the most one can be zero here. */
6350 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6351 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6352 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6353 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6354
6355 if ( fSign1
6356 ^ ( iExponent1 < iExponent2
6357 || ( iExponent1 == iExponent2
6358 && uMantissa1 < uMantissa2 ) ) )
6359 fFsw |= X86_FSW_C0;
6360 }
6361 /* else: No flags set if greater. */
6362
6363 return fFsw;
6364}
6365
6366
6367IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6368 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6369{
6370 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6371}
6372
6373
6374
6375
6376IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6377 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6378{
6379 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6380}
6381
6382
6383IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6384 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6385{
6386 RTFLOAT80U r80Val2;
6387 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6388 Assert(!fFsw || fFsw == X86_FSW_DE);
6389 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6390 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6391 {
6392 if (!(pFpuState->FCW & X86_FCW_DM))
6393 fFsw |= X86_FSW_ES | X86_FSW_B;
6394 *pfFsw |= fFsw;
6395 }
6396}
6397
6398
6399IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6400 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6401{
6402 RTFLOAT80U r80Val2;
6403 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6404 Assert(!fFsw || fFsw == X86_FSW_DE);
6405 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6406 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6407 {
6408 if (!(pFpuState->FCW & X86_FCW_DM))
6409 fFsw |= X86_FSW_ES | X86_FSW_B;
6410 *pfFsw |= fFsw;
6411 }
6412}
6413
6414
6415IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6416 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6417{
6418 RTFLOAT80U r80Val2;
6419 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6420 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6421}
6422
6423
6424IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6425 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6426{
6427 RTFLOAT80U r80Val2;
6428 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6429 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6430}
6431
6432
6433/**
6434 * Worker for fcomi & fucomi.
6435 */
6436static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6437 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6438{
6439 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6440 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6441 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6442 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6443
6444 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6445 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6446 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6447}
6448
6449
6450IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6451 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6452{
6453 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
6454}
6455
6456
6457IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6458 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6459{
6460 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
6461}
6462
6463
6464/*********************************************************************************************************************************
6465* x87 FPU Other Operations *
6466*********************************************************************************************************************************/
6467
6468/**
6469 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
6470 */
6471static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6472{
6473 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6474 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
6475 true /*exact / generate #PE */, &SoftState));
6476 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6477}
6478
6479
6480IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6481{
6482 uint16_t const fFcw = pFpuState->FCW;
6483 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6484
6485 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6486 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6487 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6488 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6489 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6490 || RTFLOAT80U_IS_INF(pr80Val))
6491 pFpuRes->r80Result = *pr80Val;
6492 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6493 {
6494 fFsw |= X86_FSW_DE;
6495 if (fFcw & X86_FCW_DM)
6496 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6497 else
6498 {
6499 pFpuRes->r80Result = *pr80Val;
6500 fFsw |= X86_FSW_ES | X86_FSW_B;
6501 }
6502 }
6503 else
6504 {
6505 if (fFcw & X86_FCW_IM)
6506 {
6507 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6508 pFpuRes->r80Result = g_r80Indefinite;
6509 else
6510 {
6511 pFpuRes->r80Result = *pr80Val;
6512 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6513 }
6514 }
6515 else
6516 {
6517 pFpuRes->r80Result = *pr80Val;
6518 fFsw |= X86_FSW_ES | X86_FSW_B;
6519 }
6520 fFsw |= X86_FSW_IE;
6521 }
6522 pFpuRes->FSW = fFsw;
6523}
6524
6525
6526IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6527 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6528{
6529 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
6530 it does everything we need it to do. */
6531 uint16_t const fFcw = pFpuState->FCW;
6532 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6533 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6534 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6535 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6536}
6537
6538
6539/**
6540 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
6541 */
6542static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6543{
6544 Assert(!pr80Val->s.fSign);
6545 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6546 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
6547 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6548}
6549
6550
6551IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6552{
6553 uint16_t const fFcw = pFpuState->FCW;
6554 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6555
6556 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
6557 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6558 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6559 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6560 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6561 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
6562 pFpuRes->r80Result = *pr80Val;
6563 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
6564 {
6565 fFsw |= X86_FSW_DE;
6566 if (fFcw & X86_FCW_DM)
6567 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6568 else
6569 {
6570 pFpuRes->r80Result = *pr80Val;
6571 fFsw |= X86_FSW_ES | X86_FSW_B;
6572 }
6573 }
6574 else
6575 {
6576 if (fFcw & X86_FCW_IM)
6577 {
6578 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6579 pFpuRes->r80Result = g_r80Indefinite;
6580 else
6581 {
6582 pFpuRes->r80Result = *pr80Val;
6583 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6584 }
6585 }
6586 else
6587 {
6588 pFpuRes->r80Result = *pr80Val;
6589 fFsw |= X86_FSW_ES | X86_FSW_B;
6590 }
6591 fFsw |= X86_FSW_IE;
6592 }
6593 pFpuRes->FSW = fFsw;
6594}
6595
6596
6597/**
6598 * @code{.unparsed}
6599 * x x * ln2
6600 * f(x) = 2 - 1 = e - 1
6601 *
6602 * @endcode
6603 *
6604 * We can approximate e^x by a Taylor/Maclaurin series (see
6605 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
6606 * @code{.unparsed}
6607 * n 0 1 2 3 4
6608 * inf x x x x x x
6609 * SUM ----- = --- + --- + --- + --- + --- + ...
6610 * n=0 n! 0! 1! 2! 3! 4!
6611 *
6612 * 2 3 4
6613 * x x x
6614 * = 1 + x + --- + --- + --- + ...
6615 * 2! 3! 4!
6616 * @endcode
6617 *
6618 * Given z = x * ln2, we get:
6619 * @code{.unparsed}
6620 * 2 3 4 n
6621 * z z z z z
6622 * e - 1 = z + --- + --- + --- + ... + ---
6623 * 2! 3! 4! n!
6624 * @endcode
6625 *
6626 * Wanting to use Horner's method, we move one z outside and get:
6627 * @code{.unparsed}
6628 * 2 3 (n-1)
6629 * z z z z
6630 * = z ( 1 + --- + --- + --- + ... + ------- )
6631 * 2! 3! 4! n!
6632 * @endcode
6633 *
6634 * The constants we need for using Horner's methods are 1 and 1 / n!.
6635 *
6636 * For very tiny x values, we can get away with f(x) = x * ln 2, because
6637 * because we don't have the necessary precision to represent 1.0 + z/3 + ...
6638 * and can approximate it to be 1.0. For a visual demonstration of this
6639 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
6640 * as it valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
6641 *
6642 *
6643 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
6644 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
6645 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
6646 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
6647 * blocks). (The one bit difference is probably an implicit one missing from
6648 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
6649 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
6650 * exponent.
6651 *
6652 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
6653 * successfully reproduced the exact results from an Intel 10980XE, there is
6654 * always a portition of rounding differences. Not going to spend too much time
6655 * on getting this 100% the same, at least not now.
6656 *
6657 * P.S. If someone are really curious about 8087 and its contstants:
6658 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
6659 *
6660 *
6661 * @param pr80Val The exponent value (x), less than 1.0, greater than
6662 * -1.0 and not zero. This can be a normal, denormal
6663 * or pseudo-denormal value.
6664 * @param pr80Result Where to return the result.
6665 * @param fFcw FPU control word.
6666 * @param fFsw FPU status word.
6667 */
6668static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6669{
6670 /* As mentioned above, we can skip the expensive polynomial calculation
6671 as it will be close enough to 1.0 that it makes no difference.
6672
6673 The cutoff point for intel 10980XE is exponents >= -69. Intel
6674 also seems to be using a 67-bit or 68-bit constant value, and we get
6675 a smattering of rounding differences if we go for higher precision. */
6676 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
6677 {
6678 RTUINT256U u256;
6679 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
6680 u256.QWords.qw0 |= 1; /* force #PE */
6681 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
6682 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
6683 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
6684 : 1 - RTFLOAT80U_EXP_BIAS,
6685 fFcw, fFsw);
6686 }
6687 else
6688 {
6689#ifdef IEM_WITH_FLOAT128_FOR_FPU
6690 /* This approach is not good enough for small values, we end up with zero. */
6691 int const fOldRounding = iemFpuF128SetRounding(fFcw);
6692 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
6693 _Float128 rd128Result = powf128(2.0L, rd128Val);
6694 rd128Result -= 1.0L;
6695 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
6696 iemFpuF128RestoreRounding(fOldRounding);
6697
6698# else
6699 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6700 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
6701
6702 /* As mentioned above, enforce 68-bit internal mantissa width to better
6703 match the Intel 10980XE results. */
6704 unsigned const cPrecision = 68;
6705
6706 /* first calculate z = x * ln2 */
6707 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
6708 cPrecision);
6709
6710 /* Then do the polynomial evaluation. */
6711 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
6712 cPrecision, &SoftState);
6713 r = f128_mul(z, r, &SoftState);
6714
6715 /* Output the result. */
6716 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
6717# endif
6718 }
6719 return fFsw;
6720}
6721
6722
6723IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6724{
6725 uint16_t const fFcw = pFpuState->FCW;
6726 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6727
6728 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6729 {
6730 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
6731 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6732 else
6733 {
6734 /* Special case:
6735 2^+1.0 - 1.0 = 1.0
6736 2^-1.0 - 1.0 = -0.5 */
6737 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
6738 && pr80Val->s.uMantissa == RT_BIT_64(63))
6739 {
6740 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
6741 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
6742 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6743 }
6744 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
6745 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
6746 else
6747 pFpuRes->r80Result = *pr80Val;
6748 fFsw |= X86_FSW_PE;
6749 if (!(fFcw & X86_FCW_PM))
6750 fFsw |= X86_FSW_ES | X86_FSW_B;
6751 }
6752 }
6753 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6754 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6755 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6756 pFpuRes->r80Result = *pr80Val;
6757 else if (RTFLOAT80U_IS_INF(pr80Val))
6758 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
6759 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6760 {
6761 fFsw |= X86_FSW_DE;
6762 if (fFcw & X86_FCW_DM)
6763 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6764 else
6765 {
6766 pFpuRes->r80Result = *pr80Val;
6767 fFsw |= X86_FSW_ES | X86_FSW_B;
6768 }
6769 }
6770 else
6771 {
6772 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6773 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6774 && (fFcw & X86_FCW_IM))
6775 pFpuRes->r80Result = g_r80Indefinite;
6776 else
6777 {
6778 pFpuRes->r80Result = *pr80Val;
6779 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6780 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6781 }
6782 fFsw |= X86_FSW_IE;
6783 if (!(fFcw & X86_FCW_IM))
6784 fFsw |= X86_FSW_ES | X86_FSW_B;
6785 }
6786 pFpuRes->FSW = fFsw;
6787}
6788
6789#endif /* IEM_WITHOUT_ASSEMBLY */
6790
6791IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6792{
6793 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
6794}
6795
6796IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6797{
6798 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
6799}
6800
6801#ifdef IEM_WITHOUT_ASSEMBLY
6802
6803IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6804{
6805 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6806 pFpuRes->r80Result = *pr80Val;
6807 pFpuRes->r80Result.s.fSign = 0;
6808}
6809
6810
6811IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6812{
6813 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6814 pFpuRes->r80Result = *pr80Val;
6815 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
6816}
6817
6818
6819IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6820{
6821 uint16_t const fFcw = pFpuState->FCW;
6822 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6823
6824 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6825 {
6826 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
6827 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
6828
6829 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
6830 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
6831 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
6832 }
6833 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6834 {
6835 fFsw |= X86_FSW_ZE;
6836 if (fFcw & X86_FCW_ZM)
6837 {
6838 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
6839 pFpuResTwo->r80Result2 = *pr80Val;
6840 }
6841 else
6842 {
6843 pFpuResTwo->r80Result2 = *pr80Val;
6844 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6845 }
6846 }
6847 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6848 {
6849 fFsw |= X86_FSW_DE;
6850 if (fFcw & X86_FCW_DM)
6851 {
6852 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
6853 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
6854 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
6855 int32_t iExponent = -16382;
6856 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
6857 {
6858 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
6859 iExponent--;
6860 }
6861
6862 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
6863 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
6864 }
6865 else
6866 {
6867 pFpuResTwo->r80Result2 = *pr80Val;
6868 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6869 }
6870 }
6871 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6872 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6873 {
6874 pFpuResTwo->r80Result1 = *pr80Val;
6875 pFpuResTwo->r80Result2 = *pr80Val;
6876 }
6877 else if (RTFLOAT80U_IS_INF(pr80Val))
6878 {
6879 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
6880 pFpuResTwo->r80Result2 = *pr80Val;
6881 }
6882 else
6883 {
6884 if (fFcw & X86_FCW_IM)
6885 {
6886 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6887 pFpuResTwo->r80Result1 = g_r80Indefinite;
6888 else
6889 {
6890 pFpuResTwo->r80Result1 = *pr80Val;
6891 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6892 }
6893 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
6894 }
6895 else
6896 {
6897 pFpuResTwo->r80Result2 = *pr80Val;
6898 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6899 }
6900 fFsw |= X86_FSW_IE;
6901 }
6902 pFpuResTwo->FSW = fFsw;
6903}
6904
6905
6906IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6907 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6908{
6909 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6910 AssertReleaseFailed();
6911}
6912
6913#endif /* IEM_WITHOUT_ASSEMBLY */
6914
6915IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6916 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6917{
6918 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6919}
6920
6921IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6922 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6923{
6924 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6925}
6926
6927#if defined(IEM_WITHOUT_ASSEMBLY)
6928
6929IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6930 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6931{
6932 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6933 AssertReleaseFailed();
6934}
6935
6936#endif /* IEM_WITHOUT_ASSEMBLY */
6937
6938IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6939 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6940{
6941 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6942}
6943
6944IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6945 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6946{
6947 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6948}
6949
6950
6951/*********************************************************************************************************************************
6952* MMX, SSE & AVX *
6953*********************************************************************************************************************************/
6954
6955/*
6956 * MOVSLDUP / VMOVSLDUP
6957 */
6958IEM_DECL_IMPL_DEF(void, iemAImpl_movsldup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
6959{
6960 puDst->au32[0] = puSrc->au32[0];
6961 puDst->au32[1] = puSrc->au32[0];
6962 puDst->au32[2] = puSrc->au32[2];
6963 puDst->au32[3] = puSrc->au32[2];
6964}
6965
6966#ifdef IEM_WITH_VEX
6967
6968IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
6969{
6970 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
6971 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
6972 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
6973 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
6974 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6975 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6976 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6977 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6978}
6979
6980
6981IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
6982{
6983 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
6984 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
6985 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
6986 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
6987 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
6988 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
6989 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
6990 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
6991}
6992
6993#endif /* IEM_WITH_VEX */
6994
6995
6996/*
6997 * MOVSHDUP / VMOVSHDUP
6998 */
6999IEM_DECL_IMPL_DEF(void, iemAImpl_movshdup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7000{
7001 puDst->au32[0] = puSrc->au32[1];
7002 puDst->au32[1] = puSrc->au32[1];
7003 puDst->au32[2] = puSrc->au32[3];
7004 puDst->au32[3] = puSrc->au32[3];
7005}
7006
7007#ifdef IEM_WITH_VEX
7008
7009IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7010{
7011 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7012 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7013 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7014 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7015 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7016 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7017 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7018 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7019}
7020
7021
7022IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7023{
7024 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7025 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7026 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7027 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7028 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7029 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7030 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7031 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7032}
7033
7034#endif /* IEM_WITH_VEX */
7035
7036
7037/*
7038 * MOVDDUP / VMOVDDUP
7039 */
7040IEM_DECL_IMPL_DEF(void, iemAImpl_movddup,(PRTUINT128U puDst, uint64_t uSrc))
7041{
7042 puDst->au64[0] = uSrc;
7043 puDst->au64[1] = uSrc;
7044}
7045
7046#ifdef IEM_WITH_VEX
7047
7048IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7049{
7050 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7051 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7052 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7053 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7054}
7055
7056IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7057{
7058 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7059 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7060 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7061 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7062}
7063
7064#endif /* IEM_WITH_VEX */
7065
7066
/*
 * PAND / VPAND / ANDPS / VANDPS / ANDPD / VANDPD
 */
7070#ifdef IEM_WITHOUT_ASSEMBLY
7071
7072IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7073{
7074 RT_NOREF(pFpuState);
7075 *puDst &= *puSrc;
7076}
7077
7078
7079IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7080{
7081 RT_NOREF(pFpuState);
7082 puDst->au64[0] &= puSrc->au64[0];
7083 puDst->au64[1] &= puSrc->au64[1];
7084}
7085
7086#endif
7087
7088IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7089 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7090{
7091 RT_NOREF(pExtState);
7092 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7093 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7094}
7095
7096
7097IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7098 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7099{
7100 RT_NOREF(pExtState);
7101 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7102 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7103 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7104 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7105}
7106
7107
/*
 * PANDN / VPANDN / ANDNPS / VANDNPS / ANDNPD / VANDNPD
 */
7111#ifdef IEM_WITHOUT_ASSEMBLY
7112
7113IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7114{
7115 RT_NOREF(pFpuState);
7116 *puDst = ~*puDst & *puSrc;
7117}
7118
7119
7120IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7121{
7122 RT_NOREF(pFpuState);
7123 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7124 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7125}
7126
7127#endif
7128
7129IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7130 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7131{
7132 RT_NOREF(pExtState);
7133 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7134 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7135}
7136
7137
7138IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7139 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7140{
7141 RT_NOREF(pExtState);
7142 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7143 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7144 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7145 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7146}
7147
7148
/*
 * POR / VPOR / ORPS / VORPS / ORPD / VORPD
 */
7152#ifdef IEM_WITHOUT_ASSEMBLY
7153
7154IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7155{
7156 RT_NOREF(pFpuState);
7157 *puDst |= *puSrc;
7158}
7159
7160
7161IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7162{
7163 RT_NOREF(pFpuState);
7164 puDst->au64[0] |= puSrc->au64[0];
7165 puDst->au64[1] |= puSrc->au64[1];
7166}
7167
7168#endif
7169
7170IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7171 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7172{
7173 RT_NOREF(pExtState);
7174 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7175 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7176}
7177
7178
7179IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7180 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7181{
7182 RT_NOREF(pExtState);
7183 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7184 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7185 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7186 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7187}
7188
7189
/*
 * PXOR / VPXOR / XORPS / VXORPS / XORPD / VXORPD
 */
7193#ifdef IEM_WITHOUT_ASSEMBLY
7194
7195IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7196{
7197 RT_NOREF(pFpuState);
7198 *puDst ^= *puSrc;
7199}
7200
7201
7202IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7203{
7204 RT_NOREF(pFpuState);
7205 puDst->au64[0] ^= puSrc->au64[0];
7206 puDst->au64[1] ^= puSrc->au64[1];
7207}
7208
7209#endif
7210
7211IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7212 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7213{
7214 RT_NOREF(pExtState);
7215 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7216 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7217}
7218
7219
7220IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7221 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7222{
7223 RT_NOREF(pExtState);
7224 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7225 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7226 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7227 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7228}
7229
7230
7231/*
7232 * PCMPEQB / VPCMPEQB
7233 */
7234#ifdef IEM_WITHOUT_ASSEMBLY
7235
7236IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7237{
7238 RT_NOREF(pFpuState);
7239 RTUINT64U uSrc1 = { *puDst };
7240 RTUINT64U uSrc2 = { *puSrc };
7241 RTUINT64U uDst;
7242 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7243 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7244 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7245 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7246 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7247 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7248 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7249 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7250 *puDst = uDst.u;
7251}
7252
7253
7254IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7255{
7256 RT_NOREF(pFpuState);
7257 RTUINT128U uSrc1 = *puDst;
7258 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7259 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7260 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7261 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7262 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7263 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7264 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7265 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7266 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7267 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7268 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7269 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7270 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7271 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7272 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7273 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7274}
7275
7276#endif
7277
7278IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7279 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7280{
7281 RT_NOREF(pExtState);
7282 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7283 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7284 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7285 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7286 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7287 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7288 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7289 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7290 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7291 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7292 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7293 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7294 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7295 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7296 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7297 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7298}
7299
7300IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7301 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7302{
7303 RT_NOREF(pExtState);
7304 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7305 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7306 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7307 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7308 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7309 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7310 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7311 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7312 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7313 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7314 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7315 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7316 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7317 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7318 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7319 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7320 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7321 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7322 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7323 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7324 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7325 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7326 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7327 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7328 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7329 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7330 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7331 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7332 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7333 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7334 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7335 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7336}
7337
7338
7339/*
7340 * PCMPEQW / VPCMPEQW
7341 */
7342#ifdef IEM_WITHOUT_ASSEMBLY
7343
7344IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7345{
7346 RT_NOREF(pFpuState);
7347 RTUINT64U uSrc1 = { *puDst };
7348 RTUINT64U uSrc2 = { *puSrc };
7349 RTUINT64U uDst;
7350 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7351 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7352 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7353 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7354 *puDst = uDst.u;
7355}
7356
7357
7358IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7359{
7360 RT_NOREF(pFpuState);
7361 RTUINT128U uSrc1 = *puDst;
7362 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7363 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7364 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7365 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7366 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7367 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7368 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7369 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7370}
7371
7372#endif
7373
7374IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7375 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7376{
7377 RT_NOREF(pExtState);
7378 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7379 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7380 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7381 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7382 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7383 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7384 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7385 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7386}
7387
7388IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7389 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7390{
7391 RT_NOREF(pExtState);
7392 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7393 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7394 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7395 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7396 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7397 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7398 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7399 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7400 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
7401 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
7402 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
7403 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
7404 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
7405 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
7406 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
7407 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
7408}
7409
7410
7411/*
7412 * PCMPEQD / VPCMPEQD.
7413 */
7414#ifdef IEM_WITHOUT_ASSEMBLY
7415
7416IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7417{
7418 RT_NOREF(pFpuState);
7419 RTUINT64U uSrc1 = { *puDst };
7420 RTUINT64U uSrc2 = { *puSrc };
7421 RTUINT64U uDst;
7422 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
7423 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
7424 *puDst = uDst.u;
7425}
7426
7427
7428IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7429{
7430 RT_NOREF(pFpuState);
7431 RTUINT128U uSrc1 = *puDst;
7432 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
7433 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
7434 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
7435 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
7436}
7437
7438#endif /* IEM_WITHOUT_ASSEMBLY */
7439
7440IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7441 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7442{
7443 RT_NOREF(pExtState);
7444 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7445 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7446 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7447 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7448}
7449
7450IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7451 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7452{
7453 RT_NOREF(pExtState);
7454 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7455 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7456 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7457 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7458 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
7459 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
7460 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
7461 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
7462}
7463
7464
7465/*
7466 * PCMPEQQ / VPCMPEQQ.
7467 */
7468IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7469{
7470 RT_NOREF(pFpuState);
7471 RTUINT128U uSrc1 = *puDst;
7472 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
7473 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
7474}
7475
7476IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7477 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7478{
7479 RT_NOREF(pExtState);
7480 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7481 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7482}
7483
7484IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7485 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7486{
7487 RT_NOREF(pExtState);
7488 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7489 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7490 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
7491 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
7492}
7493
7494
7495/*
7496 * PCMPGTB / VPCMPGTB
7497 */
7498#ifdef IEM_WITHOUT_ASSEMBLY
7499
7500IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7501{
7502 RT_NOREF(pFpuState);
7503 RTUINT64U uSrc1 = { *puDst };
7504 RTUINT64U uSrc2 = { *puSrc };
7505 RTUINT64U uDst;
7506 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
7507 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
7508 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
7509 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
7510 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
7511 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
7512 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
7513 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
7514 *puDst = uDst.u;
7515}
7516
7517
7518IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7519{
7520 RT_NOREF(pFpuState);
7521 RTUINT128U uSrc1 = *puDst;
7522 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
7523 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
7524 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
7525 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
7526 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
7527 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
7528 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
7529 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
7530 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
7531 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
7532 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
7533 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
7534 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
7535 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
7536 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
7537 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
7538}
7539
7540#endif
7541
7542IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7543 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7544{
7545 RT_NOREF(pExtState);
7546 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7547 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7548 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7549 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7550 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7551 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7552 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7553 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7554 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7555 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7556 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7557 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7558 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7559 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7560 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7561 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7562}
7563
7564IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7565 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7566{
7567 RT_NOREF(pExtState);
7568 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7569 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7570 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7571 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7572 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7573 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7574 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7575 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7576 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7577 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7578 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7579 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7580 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7581 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7582 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7583 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7584 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
7585 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
7586 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
7587 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
7588 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
7589 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
7590 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
7591 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
7592 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
7593 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
7594 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
7595 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
7596 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
7597 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
7598 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
7599 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
7600}
7601
7602
7603/*
7604 * PCMPGTW / VPCMPGTW
7605 */
7606#ifdef IEM_WITHOUT_ASSEMBLY
7607
7608IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7609{
7610 RT_NOREF(pFpuState);
7611 RTUINT64U uSrc1 = { *puDst };
7612 RTUINT64U uSrc2 = { *puSrc };
7613 RTUINT64U uDst;
7614 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
7615 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
7616 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
7617 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
7618 *puDst = uDst.u;
7619}
7620
7621
7622IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7623{
7624 RT_NOREF(pFpuState);
7625 RTUINT128U uSrc1 = *puDst;
7626 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
7627 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
7628 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
7629 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
7630 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
7631 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
7632 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
7633 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
7634}
7635
7636#endif
7637
7638IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7639 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7640{
7641 RT_NOREF(pExtState);
7642 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
7643 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
7644 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
7645 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
7646 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
7647 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
7648 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
7649 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
7650}
7651
7652IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7653 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7654{
7655 RT_NOREF(pExtState);
7656 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
7657 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
7658 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
7659 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
7660 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
7661 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
7662 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
7663 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
7664 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
7665 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
7666 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
7667 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
7668 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
7669 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
7670 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
7671 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
7672}
7673
7674
7675/*
7676 * PCMPGTD / VPCMPGTD.
7677 */
7678#ifdef IEM_WITHOUT_ASSEMBLY
7679
7680IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7681{
7682 RT_NOREF(pFpuState);
7683 RTUINT64U uSrc1 = { *puDst };
7684 RTUINT64U uSrc2 = { *puSrc };
7685 RTUINT64U uDst;
7686 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
7687 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
7688 *puDst = uDst.u;
7689}
7690
7691
7692IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7693{
7694 RT_NOREF(pFpuState);
7695 RTUINT128U uSrc1 = *puDst;
7696 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
7697 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
7698 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
7699 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
7700}
7701
7702#endif /* IEM_WITHOUT_ASSEMBLY */
7703
7704IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7705 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7706{
7707 RT_NOREF(pExtState);
7708 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
7709 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
7710 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
7711 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
7712}
7713
7714IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7715 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7716{
7717 RT_NOREF(pExtState);
7718 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
7719 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
7720 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
7721 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
7722 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
7723 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
7724 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
7725 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
7726}
7727
7728
7729/*
7730 * PCMPGTQ / VPCMPGTQ.
7731 */
7732IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7733{
7734 RT_NOREF(pFpuState);
7735 RTUINT128U uSrc1 = *puDst;
7736 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
7737 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
7738}
7739
7740IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7741 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7742{
7743 RT_NOREF(pExtState);
7744 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
7745 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
7746}
7747
7748IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7749 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7750{
7751 RT_NOREF(pExtState);
7752 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
7753 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
7754 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
7755 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
7756}
7757
7758
7759/*
7760 * PADDB / VPADDB
7761 */
7762#ifdef IEM_WITHOUT_ASSEMBLY
7763
7764IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7765{
7766 RT_NOREF(pFpuState);
7767 RTUINT64U uSrc1 = { *puDst };
7768 RTUINT64U uSrc2 = { *puSrc };
7769 RTUINT64U uDst;
7770 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
7771 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
7772 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
7773 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
7774 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
7775 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
7776 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
7777 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
7778 *puDst = uDst.u;
7779}
7780
7781
7782IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7783{
7784 RT_NOREF(pFpuState);
7785 RTUINT128U uSrc1 = *puDst;
7786 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
7787 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
7788 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
7789 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
7790 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
7791 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
7792 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
7793 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
7794 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
7795 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
7796 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
7797 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
7798 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
7799 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
7800 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
7801 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
7802}
7803
7804#endif
7805
7806
7807IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7808 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7809{
7810 RT_NOREF(pExtState);
7811 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
7812 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
7813 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
7814 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
7815 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
7816 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
7817 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
7818 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
7819 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
7820 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
7821 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
7822 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
7823 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
7824 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
7825 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
7826 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
7827}
7828
7829IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7830 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7831{
7832 RT_NOREF(pExtState);
7833 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
7834 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
7835 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
7836 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
7837 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
7838 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
7839 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
7840 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
7841 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
7842 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
7843 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
7844 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
7845 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
7846 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
7847 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
7848 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
7849 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
7850 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
7851 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
7852 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
7853 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
7854 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
7855 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
7856 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
7857 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
7858 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
7859 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
7860 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
7861 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
7862 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
7863 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
7864 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
7865}
7866
7867
7868/*
7869 * PADDSB / VPADDSB
7870 */
/** @def SATURATED_SIGNED_WORD_TO_SIGNED_BYTE
 * Narrows a signed intermediate value to a byte using signed saturation.
 *
 * When the value plus 0x80, truncated to 16 bits, lies in 0..0xff (which is
 * the case for -128..127) the low eight bits of the value are the result.
 * Otherwise the result is 0x7f plus bit 15 of the value: 0x7f (INT8_MAX) when
 * that bit is clear and 0x80 (INT8_MIN) when it is set.
 *
 * @param   a_iWord     The value to narrow.  It is expanded more than once,
 *                      so it must be free of side effects.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) \
      : (uint8_t)(a_iWord) )
7875
7876#ifdef IEM_WITHOUT_ASSEMBLY
7877
7878IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7879{
7880 RT_NOREF(pFpuState);
7881 RTUINT64U uSrc1 = { *puDst };
7882 RTUINT64U uSrc2 = { *puSrc };
7883 RTUINT64U uDst;
7884 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
7885 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
7886 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
7887 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
7888 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
7889 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
7890 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
7891 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
7892 *puDst = uDst.u;
7893}
7894
7895
7896IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7897{
7898 RT_NOREF(pFpuState);
7899 RTUINT128U uSrc1 = *puDst;
7900 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
7901 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
7902 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
7903 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
7904 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
7905 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
7906 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
7907 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
7908 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
7909 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
7910 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
7911 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
7912 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
7913 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
7914 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
7915 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
7916}
7917
7918#endif
7919
7920
/*
 * PADDUSB / VPADDUSB
 */
/** @def SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE
 * Narrows an intermediate value to a byte using unsigned saturation.
 *
 * When the value, truncated to 16 bits, is at most 0xff the low eight bits
 * are the result; otherwise the result is 0xff (UINT8_MAX).
 *
 * @param   a_uWord     The value to narrow.  It is expanded more than once,
 *                      so it must be free of side effects.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
7928
7929#ifdef IEM_WITHOUT_ASSEMBLY
7930
7931IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7932{
7933 RT_NOREF(pFpuState);
7934 RTUINT64U uSrc1 = { *puDst };
7935 RTUINT64U uSrc2 = { *puSrc };
7936 RTUINT64U uDst;
7937 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
7938 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
7939 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
7940 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
7941 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
7942 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
7943 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
7944 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
7945 *puDst = uDst.u;
7946}
7947
7948
7949IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7950{
7951 RT_NOREF(pFpuState);
7952 RTUINT128U uSrc1 = *puDst;
7953 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
7954 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
7955 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
7956 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
7957 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
7958 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
7959 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
7960 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
7961 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
7962 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
7963 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
7964 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
7965 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
7966 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
7967 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
7968 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
7969}
7970
7971#endif
7972
7973
7974/*
7975 * PADDW / VPADDW
7976 */
7977#ifdef IEM_WITHOUT_ASSEMBLY
7978
7979IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7980{
7981 RT_NOREF(pFpuState);
7982 RTUINT64U uSrc1 = { *puDst };
7983 RTUINT64U uSrc2 = { *puSrc };
7984 RTUINT64U uDst;
7985 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
7986 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
7987 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
7988 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
7989 *puDst = uDst.u;
7990}
7991
7992
7993IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7994{
7995 RT_NOREF(pFpuState);
7996 RTUINT128U uSrc1 = *puDst;
7997 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
7998 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
7999 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8000 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8001 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8002 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8003 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8004 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8005}
8006
8007#endif
8008
8009
8010IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8011 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8012{
8013 RT_NOREF(pExtState);
8014 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8015 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8016 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8017 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8018 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8019 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8020 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8021 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8022}
8023
8024IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8025 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8026{
8027 RT_NOREF(pExtState);
8028 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8029 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8030 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8031 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8032 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8033 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8034 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8035 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8036 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8037 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8038 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8039 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8040 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8041 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8042 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8043 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8044}
8045
8046
8047/*
8048 * PADDSW / VPADDSW
8049 */
/** @def SATURATED_SIGNED_DWORD_TO_SIGNED_WORD
 * Narrows a signed intermediate value to a 16-bit word using signed
 * saturation.
 *
 * When the value plus 0x8000, converted to 32 bits unsigned, is at most
 * 0xffff (which is the case for -32768..32767) the low sixteen bits of the
 * value are the result.  Otherwise the result is 0x7fff plus bit 31 of the
 * value: 0x7fff (INT16_MAX) when that bit is clear and 0x8000 (INT16_MIN)
 * when it is set.
 *
 * @param   a_iDword    The value to narrow.  It is expanded more than once,
 *                      so it must be free of side effects.
 */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) )
8054
8055#ifdef IEM_WITHOUT_ASSEMBLY
8056
8057IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8058{
8059 RT_NOREF(pFpuState);
8060 RTUINT64U uSrc1 = { *puDst };
8061 RTUINT64U uSrc2 = { *puSrc };
8062 RTUINT64U uDst;
8063 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8064 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8065 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8066 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8067 *puDst = uDst.u;
8068}
8069
8070
8071IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8072{
8073 RT_NOREF(pFpuState);
8074 RTUINT128U uSrc1 = *puDst;
8075 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8076 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8077 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8078 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8079 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8080 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8081 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8082 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8083}
8084
8085#endif
8086
8087
8088/*
8089 * PADDUSW / VPADDUSW
8090 */
/** @def SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD
 * Narrows an intermediate value to a 16-bit word using unsigned saturation.
 *
 * When the value, converted to 32 bits unsigned, is at most 0xffff the low
 * sixteen bits are the result; otherwise the result is 0xffff (UINT16_MAX).
 *
 * @param   a_uDword    The value to narrow.  It is expanded more than once,
 *                      so it must be free of side effects.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
8095
8096#ifdef IEM_WITHOUT_ASSEMBLY
8097
8098IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8099{
8100 RT_NOREF(pFpuState);
8101 RTUINT64U uSrc1 = { *puDst };
8102 RTUINT64U uSrc2 = { *puSrc };
8103 RTUINT64U uDst;
8104 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8105 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8106 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8107 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8108 *puDst = uDst.u;
8109}
8110
8111
8112IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8113{
8114 RT_NOREF(pFpuState);
8115 RTUINT128U uSrc1 = *puDst;
8116 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8117 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8118 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8119 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8120 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8121 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8122 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8123 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8124}
8125
8126#endif
8127
8128
8129/*
8130 * PADDD / VPADDD.
8131 */
8132#ifdef IEM_WITHOUT_ASSEMBLY
8133
8134IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8135{
8136 RT_NOREF(pFpuState);
8137 RTUINT64U uSrc1 = { *puDst };
8138 RTUINT64U uSrc2 = { *puSrc };
8139 RTUINT64U uDst;
8140 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8141 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8142 *puDst = uDst.u;
8143}
8144
8145
8146IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8147{
8148 RT_NOREF(pFpuState);
8149 RTUINT128U uSrc1 = *puDst;
8150 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8151 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8152 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8153 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8154}
8155
8156#endif /* IEM_WITHOUT_ASSEMBLY */
8157
8158IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8159 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8160{
8161 RT_NOREF(pExtState);
8162 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8163 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8164 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8165 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8166}
8167
8168IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8169 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8170{
8171 RT_NOREF(pExtState);
8172 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8173 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8174 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8175 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8176 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8177 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8178 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8179 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8180}
8181
8182
8183/*
8184 * PADDQ / VPADDQ.
8185 */
8186#ifdef IEM_WITHOUT_ASSEMBLY
8187
8188IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8189{
8190 RT_NOREF(pFpuState);
8191 *puDst = *puDst + *puSrc;
8192}
8193
8194IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8195{
8196 RT_NOREF(pFpuState);
8197 RTUINT128U uSrc1 = *puDst;
8198 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8199 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8200}
8201
8202#endif
8203
8204IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8205 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8206{
8207 RT_NOREF(pExtState);
8208 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8209 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8210}
8211
8212IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8213 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8214{
8215 RT_NOREF(pExtState);
8216 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8217 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8218 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8219 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8220}
8221
8222
8223/*
8224 * PSUBB / VPSUBB
8225 */
8226#ifdef IEM_WITHOUT_ASSEMBLY
8227
8228IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8229{
8230 RT_NOREF(pFpuState);
8231 RTUINT64U uSrc1 = { *puDst };
8232 RTUINT64U uSrc2 = { *puSrc };
8233 RTUINT64U uDst;
8234 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8235 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8236 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8237 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8238 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8239 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8240 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8241 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8242 *puDst = uDst.u;
8243}
8244
8245
8246IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8247{
8248 RT_NOREF(pFpuState);
8249 RTUINT128U uSrc1 = *puDst;
8250 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8251 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8252 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8253 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8254 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8255 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8256 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8257 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8258 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8259 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8260 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8261 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8262 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8263 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8264 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8265 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8266}
8267
8268#endif
8269
8270IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8271 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8272{
8273 RT_NOREF(pExtState);
8274 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8275 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8276 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8277 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8278 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8279 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8280 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8281 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8282 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8283 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8284 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8285 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8286 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8287 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8288 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8289 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8290}
8291
8292IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8293 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8294{
8295 RT_NOREF(pExtState);
8296 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8297 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8298 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8299 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8300 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8301 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8302 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8303 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8304 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8305 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8306 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8307 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8308 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8309 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8310 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8311 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8312 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
8313 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
8314 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
8315 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
8316 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
8317 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
8318 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
8319 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
8320 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
8321 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
8322 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
8323 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
8324 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
8325 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
8326 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
8327 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
8328}
8329
8330
/*
 * PSUBSB / VPSUBSB
 */
8334#ifdef IEM_WITHOUT_ASSEMBLY
8335
8336IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8337{
8338 RT_NOREF(pFpuState);
8339 RTUINT64U uSrc1 = { *puDst };
8340 RTUINT64U uSrc2 = { *puSrc };
8341 RTUINT64U uDst;
8342 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
8343 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
8344 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
8345 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
8346 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
8347 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
8348 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
8349 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
8350 *puDst = uDst.u;
8351}
8352
8353
8354IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8355{
8356 RT_NOREF(pFpuState);
8357 RTUINT128U uSrc1 = *puDst;
8358 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
8359 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
8360 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
8361 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
8362 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
8363 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
8364 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
8365 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
8366 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
8367 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
8368 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
8369 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
8370 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
8371 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
8372 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
8373 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
8374}
8375
8376#endif
8377
8378
/*
 * PSUBUSB / VPSUBUSB
 */
/** Clamps the difference of two unsigned bytes: a value that fits in a byte is
 *  passed through, anything else (i.e. the subtraction borrowed) becomes 0. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
8386
8387#ifdef IEM_WITHOUT_ASSEMBLY
8388
8389IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8390{
8391 RT_NOREF(pFpuState);
8392 RTUINT64U uSrc1 = { *puDst };
8393 RTUINT64U uSrc2 = { *puSrc };
8394 RTUINT64U uDst;
8395 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
8396 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
8397 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
8398 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
8399 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
8400 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
8401 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
8402 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
8403 *puDst = uDst.u;
8404}
8405
8406
8407IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8408{
8409 RT_NOREF(pFpuState);
8410 RTUINT128U uSrc1 = *puDst;
8411 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
8412 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
8413 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
8414 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
8415 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
8416 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
8417 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
8418 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
8419 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
8420 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
8421 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
8422 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
8423 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
8424 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
8425 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
8426 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
8427}
8428
8429#endif
8430
8431
8432/*
8433 * PSUBW / VPSUBW
8434 */
8435#ifdef IEM_WITHOUT_ASSEMBLY
8436
8437IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8438{
8439 RT_NOREF(pFpuState);
8440 RTUINT64U uSrc1 = { *puDst };
8441 RTUINT64U uSrc2 = { *puSrc };
8442 RTUINT64U uDst;
8443 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
8444 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
8445 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
8446 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
8447 *puDst = uDst.u;
8448}
8449
8450
8451IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8452{
8453 RT_NOREF(pFpuState);
8454 RTUINT128U uSrc1 = *puDst;
8455 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
8456 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
8457 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
8458 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
8459 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
8460 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
8461 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
8462 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
8463}
8464
8465#endif
8466
8467IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8468 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8469{
8470 RT_NOREF(pExtState);
8471 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8472 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8473 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8474 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8475 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8476 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8477 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8478 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8479}
8480
8481IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8482 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8483{
8484 RT_NOREF(pExtState);
8485 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8486 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8487 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8488 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8489 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8490 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8491 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8492 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8493 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
8494 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
8495 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
8496 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
8497 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
8498 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
8499 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
8500 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
8501}
8502
8503
8504/*
8505 * PSUBSW / VPSUBSW
8506 */
8507#ifdef IEM_WITHOUT_ASSEMBLY
8508
8509IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8510{
8511 RT_NOREF(pFpuState);
8512 RTUINT64U uSrc1 = { *puDst };
8513 RTUINT64U uSrc2 = { *puSrc };
8514 RTUINT64U uDst;
8515 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
8516 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
8517 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
8518 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
8519 *puDst = uDst.u;
8520}
8521
8522
8523IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8524{
8525 RT_NOREF(pFpuState);
8526 RTUINT128U uSrc1 = *puDst;
8527 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
8528 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
8529 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
8530 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
8531 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
8532 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
8533 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
8534 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
8535}
8536
8537#endif
8538
8539
8540/*
8541 * PSUBUSW / VPSUBUSW
8542 */
/** Clamps the difference of two unsigned words: a value that fits in 16 bits
 *  is passed through, anything else (i.e. the subtraction borrowed) becomes 0. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint32_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
8547
8548#ifdef IEM_WITHOUT_ASSEMBLY
8549
8550IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8551{
8552 RT_NOREF(pFpuState);
8553 RTUINT64U uSrc1 = { *puDst };
8554 RTUINT64U uSrc2 = { *puSrc };
8555 RTUINT64U uDst;
8556 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
8557 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
8558 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
8559 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
8560 *puDst = uDst.u;
8561}
8562
8563
8564IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8565{
8566 RT_NOREF(pFpuState);
8567 RTUINT128U uSrc1 = *puDst;
8568 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
8569 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
8570 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
8571 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
8572 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
8573 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
8574 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
8575 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
8576}
8577
8578#endif
8579
8580
8581/*
8582 * PSUBD / VPSUBD.
8583 */
8584#ifdef IEM_WITHOUT_ASSEMBLY
8585
8586IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8587{
8588 RT_NOREF(pFpuState);
8589 RTUINT64U uSrc1 = { *puDst };
8590 RTUINT64U uSrc2 = { *puSrc };
8591 RTUINT64U uDst;
8592 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
8593 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
8594 *puDst = uDst.u;
8595}
8596
8597
8598IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8599{
8600 RT_NOREF(pFpuState);
8601 RTUINT128U uSrc1 = *puDst;
8602 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
8603 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
8604 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
8605 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
8606}
8607
8608#endif /* IEM_WITHOUT_ASSEMBLY */
8609
8610IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8611 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8612{
8613 RT_NOREF(pExtState);
8614 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
8615 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
8616 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
8617 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
8618}
8619
8620IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8621 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8622{
8623 RT_NOREF(pExtState);
8624 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
8625 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
8626 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
8627 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
8628 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
8629 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
8630 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
8631 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
8632}
8633
8634
8635/*
8636 * PSUBQ / VPSUBQ.
8637 */
8638#ifdef IEM_WITHOUT_ASSEMBLY
8639
8640IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8641{
8642 RT_NOREF(pFpuState);
8643 *puDst = *puDst - *puSrc;
8644}
8645
8646IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8647{
8648 RT_NOREF(pFpuState);
8649 RTUINT128U uSrc1 = *puDst;
8650 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
8651 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
8652}
8653
8654#endif
8655
8656IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8657 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8658{
8659 RT_NOREF(pExtState);
8660 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
8661 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
8662}
8663
8664IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8665 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8666{
8667 RT_NOREF(pExtState);
8668 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
8669 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
8670 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
8671 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
8672}
8673
8674
8675
8676/*
8677 * PMULLW / VPMULLW
8678 */
8679#ifdef IEM_WITHOUT_ASSEMBLY
8680
8681IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8682{
8683 RT_NOREF(pFpuState);
8684 RTUINT64U uSrc1 = { *puDst };
8685 RTUINT64U uSrc2 = { *puSrc };
8686 RTUINT64U uDst;
8687 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
8688 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
8689 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
8690 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
8691 *puDst = uDst.u;
8692}
8693
8694
8695IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8696{
8697 RT_NOREF(pFpuState);
8698 RTUINT128U uSrc1 = *puDst;
8699 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
8700 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
8701 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
8702 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
8703 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
8704 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
8705 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
8706 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
8707}
8708
8709#endif
8710
8711
8712/*
8713 * PMULHW / VPMULHW
8714 */
8715#ifdef IEM_WITHOUT_ASSEMBLY
8716
8717IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8718{
8719 RT_NOREF(pFpuState);
8720 RTUINT64U uSrc1 = { *puDst };
8721 RTUINT64U uSrc2 = { *puSrc };
8722 RTUINT64U uDst;
8723 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
8724 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
8725 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
8726 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
8727 *puDst = uDst.u;
8728}
8729
8730
8731IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8732{
8733 RT_NOREF(pFpuState);
8734 RTUINT128U uSrc1 = *puDst;
8735 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
8736 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
8737 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
8738 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
8739 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
8740 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
8741 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
8742 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
8743}
8744
8745#endif
8746
8747
8748/*
8749 * PSRLW / VPSRLW
8750 */
8751#ifdef IEM_WITHOUT_ASSEMBLY
8752
8753IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8754{
8755 RTUINT64U uSrc1 = { *puDst };
8756 RTUINT64U uSrc2 = { *puSrc };
8757 RTUINT64U uDst;
8758
8759 if (uSrc2.au64[0] <= 15)
8760 {
8761 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
8762 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
8763 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
8764 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
8765 }
8766 else
8767 {
8768 uDst.au64[0] = 0;
8769 }
8770 *puDst = uDst.u;
8771}
8772
8773
8774IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
8775{
8776 RTUINT64U uSrc1 = { *puDst };
8777 RTUINT64U uDst;
8778
8779 if (uShift <= 15)
8780 {
8781 uDst.au16[0] = uSrc1.au16[0] >> uShift;
8782 uDst.au16[1] = uSrc1.au16[1] >> uShift;
8783 uDst.au16[2] = uSrc1.au16[2] >> uShift;
8784 uDst.au16[3] = uSrc1.au16[3] >> uShift;
8785 }
8786 else
8787 {
8788 uDst.au64[0] = 0;
8789 }
8790 *puDst = uDst.u;
8791}
8792
8793
8794IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8795{
8796 RTUINT128U uSrc1 = *puDst;
8797
8798 if (puSrc->au64[0] <= 15)
8799 {
8800 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
8801 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
8802 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
8803 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
8804 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
8805 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
8806 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
8807 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
8808 }
8809 else
8810 {
8811 puDst->au64[0] = 0;
8812 puDst->au64[1] = 0;
8813 }
8814}
8815
8816IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
8817{
8818 RTUINT128U uSrc1 = *puDst;
8819
8820 if (uShift <= 15)
8821 {
8822 puDst->au16[0] = uSrc1.au16[0] >> uShift;
8823 puDst->au16[1] = uSrc1.au16[1] >> uShift;
8824 puDst->au16[2] = uSrc1.au16[2] >> uShift;
8825 puDst->au16[3] = uSrc1.au16[3] >> uShift;
8826 puDst->au16[4] = uSrc1.au16[4] >> uShift;
8827 puDst->au16[5] = uSrc1.au16[5] >> uShift;
8828 puDst->au16[6] = uSrc1.au16[6] >> uShift;
8829 puDst->au16[7] = uSrc1.au16[7] >> uShift;
8830 }
8831 else
8832 {
8833 puDst->au64[0] = 0;
8834 puDst->au64[1] = 0;
8835 }
8836}
8837
8838#endif
8839
8840
8841/*
8842 * PSRAW / VPSRAW
8843 */
8844#ifdef IEM_WITHOUT_ASSEMBLY
8845
8846IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8847{
8848 RTUINT64U uSrc1 = { *puDst };
8849 RTUINT64U uSrc2 = { *puSrc };
8850 RTUINT64U uDst;
8851
8852 if (uSrc2.au64[0] <= 15)
8853 {
8854 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
8855 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
8856 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
8857 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
8858 }
8859 else
8860 {
8861 uDst.au64[0] = 0;
8862 }
8863 *puDst = uDst.u;
8864}
8865
8866
8867IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
8868{
8869 RTUINT64U uSrc1 = { *puDst };
8870 RTUINT64U uDst;
8871
8872 if (uShift <= 15)
8873 {
8874 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
8875 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
8876 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
8877 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
8878 }
8879 else
8880 {
8881 uDst.au64[0] = 0;
8882 }
8883 *puDst = uDst.u;
8884}
8885
8886
8887IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8888{
8889 RTUINT128U uSrc1 = *puDst;
8890
8891 if (puSrc->au64[0] <= 15)
8892 {
8893 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
8894 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
8895 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
8896 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
8897 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
8898 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
8899 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
8900 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
8901 }
8902 else
8903 {
8904 puDst->au64[0] = 0;
8905 puDst->au64[1] = 0;
8906 }
8907}
8908
8909IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
8910{
8911 RTUINT128U uSrc1 = *puDst;
8912
8913 if (uShift <= 15)
8914 {
8915 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
8916 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
8917 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
8918 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
8919 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
8920 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
8921 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
8922 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
8923 }
8924 else
8925 {
8926 puDst->au64[0] = 0;
8927 puDst->au64[1] = 0;
8928 }
8929}
8930
8931#endif
8932
8933
8934/*
8935 * PSLLW / VPSLLW
8936 */
8937#ifdef IEM_WITHOUT_ASSEMBLY
8938
8939IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8940{
8941 RTUINT64U uSrc1 = { *puDst };
8942 RTUINT64U uSrc2 = { *puSrc };
8943 RTUINT64U uDst;
8944
8945 if (uSrc2.au64[0] <= 15)
8946 {
8947 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
8948 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
8949 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
8950 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
8951 }
8952 else
8953 {
8954 uDst.au64[0] = 0;
8955 }
8956 *puDst = uDst.u;
8957}
8958
8959
8960IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
8961{
8962 RTUINT64U uSrc1 = { *puDst };
8963 RTUINT64U uDst;
8964
8965 if (uShift <= 15)
8966 {
8967 uDst.au16[0] = uSrc1.au16[0] << uShift;
8968 uDst.au16[1] = uSrc1.au16[1] << uShift;
8969 uDst.au16[2] = uSrc1.au16[2] << uShift;
8970 uDst.au16[3] = uSrc1.au16[3] << uShift;
8971 }
8972 else
8973 {
8974 uDst.au64[0] = 0;
8975 }
8976 *puDst = uDst.u;
8977}
8978
8979
8980IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8981{
8982 RTUINT128U uSrc1 = *puDst;
8983
8984 if (puSrc->au64[0] <= 15)
8985 {
8986 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
8987 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
8988 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
8989 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
8990 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
8991 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
8992 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
8993 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
8994 }
8995 else
8996 {
8997 puDst->au64[0] = 0;
8998 puDst->au64[1] = 0;
8999 }
9000}
9001
9002IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9003{
9004 RTUINT128U uSrc1 = *puDst;
9005
9006 if (uShift <= 15)
9007 {
9008 puDst->au16[0] = uSrc1.au16[0] << uShift;
9009 puDst->au16[1] = uSrc1.au16[1] << uShift;
9010 puDst->au16[2] = uSrc1.au16[2] << uShift;
9011 puDst->au16[3] = uSrc1.au16[3] << uShift;
9012 puDst->au16[4] = uSrc1.au16[4] << uShift;
9013 puDst->au16[5] = uSrc1.au16[5] << uShift;
9014 puDst->au16[6] = uSrc1.au16[6] << uShift;
9015 puDst->au16[7] = uSrc1.au16[7] << uShift;
9016 }
9017 else
9018 {
9019 puDst->au64[0] = 0;
9020 puDst->au64[1] = 0;
9021 }
9022}
9023
9024#endif
9025
9026
9027/*
9028 * PSRLD / VPSRLD
9029 */
9030#ifdef IEM_WITHOUT_ASSEMBLY
9031
9032IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9033{
9034 RTUINT64U uSrc1 = { *puDst };
9035 RTUINT64U uSrc2 = { *puSrc };
9036 RTUINT64U uDst;
9037
9038 if (uSrc2.au64[0] <= 31)
9039 {
9040 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
9041 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
9042 }
9043 else
9044 {
9045 uDst.au64[0] = 0;
9046 }
9047 *puDst = uDst.u;
9048}
9049
9050
9051IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9052{
9053 RTUINT64U uSrc1 = { *puDst };
9054 RTUINT64U uDst;
9055
9056 if (uShift <= 31)
9057 {
9058 uDst.au32[0] = uSrc1.au32[0] >> uShift;
9059 uDst.au32[1] = uSrc1.au32[1] >> uShift;
9060 }
9061 else
9062 {
9063 uDst.au64[0] = 0;
9064 }
9065 *puDst = uDst.u;
9066}
9067
9068
9069IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9070{
9071 RTUINT128U uSrc1 = *puDst;
9072
9073 if (puSrc->au64[0] <= 31)
9074 {
9075 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
9076 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
9077 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
9078 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
9079 }
9080 else
9081 {
9082 puDst->au64[0] = 0;
9083 puDst->au64[1] = 0;
9084 }
9085}
9086
9087IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9088{
9089 RTUINT128U uSrc1 = *puDst;
9090
9091 if (uShift <= 31)
9092 {
9093 puDst->au32[0] = uSrc1.au32[0] >> uShift;
9094 puDst->au32[1] = uSrc1.au32[1] >> uShift;
9095 puDst->au32[2] = uSrc1.au32[2] >> uShift;
9096 puDst->au32[3] = uSrc1.au32[3] >> uShift;
9097 }
9098 else
9099 {
9100 puDst->au64[0] = 0;
9101 puDst->au64[1] = 0;
9102 }
9103}
9104
9105#endif
9106
9107
9108/*
9109 * PSRAD / VPSRAD
9110 */
9111#ifdef IEM_WITHOUT_ASSEMBLY
9112
9113IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
9114{
9115 RTUINT64U uSrc1 = { *puDst };
9116 RTUINT64U uSrc2 = { *puSrc };
9117 RTUINT64U uDst;
9118
9119 if (uSrc2.au64[0] <= 31)
9120 {
9121 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
9122 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
9123 }
9124 else
9125 {
9126 uDst.au64[0] = 0;
9127 }
9128 *puDst = uDst.u;
9129}
9130
9131
9132IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
9133{
9134 RTUINT64U uSrc1 = { *puDst };
9135 RTUINT64U uDst;
9136
9137 if (uShift <= 31)
9138 {
9139 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
9140 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
9141 }
9142 else
9143 {
9144 uDst.au64[0] = 0;
9145 }
9146 *puDst = uDst.u;
9147}
9148
9149
9150IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9151{
9152 RTUINT128U uSrc1 = *puDst;
9153
9154 if (puSrc->au64[0] <= 31)
9155 {
9156 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
9157 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
9158 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
9159 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
9160 }
9161 else
9162 {
9163 puDst->au64[0] = 0;
9164 puDst->au64[1] = 0;
9165 }
9166}
9167
9168IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9169{
9170 RTUINT128U uSrc1 = *puDst;
9171
9172 if (uShift <= 31)
9173 {
9174 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
9175 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
9176 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
9177 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
9178 }
9179 else
9180 {
9181 puDst->au64[0] = 0;
9182 puDst->au64[1] = 0;
9183 }
9184}
9185
9186#endif
9187
9188
9189/*
9190 * PSLLD / VPSLLD
9191 */
9192#ifdef IEM_WITHOUT_ASSEMBLY
9193
9194IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9195{
9196 RTUINT64U uSrc1 = { *puDst };
9197 RTUINT64U uSrc2 = { *puSrc };
9198 RTUINT64U uDst;
9199
9200 if (uSrc2.au64[0] <= 31)
9201 {
9202 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
9203 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
9204 }
9205 else
9206 {
9207 uDst.au64[0] = 0;
9208 }
9209 *puDst = uDst.u;
9210}
9211
9212
9213IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9214{
9215 RTUINT64U uSrc1 = { *puDst };
9216 RTUINT64U uDst;
9217
9218 if (uShift <= 31)
9219 {
9220 uDst.au32[0] = uSrc1.au32[0] << uShift;
9221 uDst.au32[1] = uSrc1.au32[1] << uShift;
9222 }
9223 else
9224 {
9225 uDst.au64[0] = 0;
9226 }
9227 *puDst = uDst.u;
9228}
9229
9230
9231IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9232{
9233 RTUINT128U uSrc1 = *puDst;
9234
9235 if (puSrc->au64[0] <= 31)
9236 {
9237 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
9238 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
9239 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
9240 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
9241 }
9242 else
9243 {
9244 puDst->au64[0] = 0;
9245 puDst->au64[1] = 0;
9246 }
9247}
9248
9249IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9250{
9251 RTUINT128U uSrc1 = *puDst;
9252
9253 if (uShift <= 31)
9254 {
9255 puDst->au32[0] = uSrc1.au32[0] << uShift;
9256 puDst->au32[1] = uSrc1.au32[1] << uShift;
9257 puDst->au32[2] = uSrc1.au32[2] << uShift;
9258 puDst->au32[3] = uSrc1.au32[3] << uShift;
9259 }
9260 else
9261 {
9262 puDst->au64[0] = 0;
9263 puDst->au64[1] = 0;
9264 }
9265}
9266
9267#endif
9268
9269
9270/*
9271 * PSRLQ / VPSRLQ
9272 */
9273#ifdef IEM_WITHOUT_ASSEMBLY
9274
9275IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9276{
9277 RTUINT64U uSrc1 = { *puDst };
9278 RTUINT64U uSrc2 = { *puSrc };
9279 RTUINT64U uDst;
9280
9281 if (uSrc2.au64[0] <= 63)
9282 {
9283 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
9284 }
9285 else
9286 {
9287 uDst.au64[0] = 0;
9288 }
9289 *puDst = uDst.u;
9290}
9291
9292
9293IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
9294{
9295 RTUINT64U uSrc1 = { *puDst };
9296 RTUINT64U uDst;
9297
9298 if (uShift <= 63)
9299 {
9300 uDst.au64[0] = uSrc1.au64[0] >> uShift;
9301 }
9302 else
9303 {
9304 uDst.au64[0] = 0;
9305 }
9306 *puDst = uDst.u;
9307}
9308
9309
9310IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9311{
9312 RTUINT128U uSrc1 = *puDst;
9313
9314 if (puSrc->au64[0] <= 63)
9315 {
9316 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
9317 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
9318 }
9319 else
9320 {
9321 puDst->au64[0] = 0;
9322 puDst->au64[1] = 0;
9323 }
9324}
9325
9326IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9327{
9328 RTUINT128U uSrc1 = *puDst;
9329
9330 if (uShift <= 63)
9331 {
9332 puDst->au64[0] = uSrc1.au64[0] >> uShift;
9333 puDst->au64[1] = uSrc1.au64[1] >> uShift;
9334 }
9335 else
9336 {
9337 puDst->au64[0] = 0;
9338 puDst->au64[1] = 0;
9339 }
9340}
9341
9342#endif
9343
9344
9345/*
9346 * PSLLQ / VPSLLQ
9347 */
9348#ifdef IEM_WITHOUT_ASSEMBLY
9349
9350IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9351{
9352 RTUINT64U uSrc1 = { *puDst };
9353 RTUINT64U uSrc2 = { *puSrc };
9354 RTUINT64U uDst;
9355
9356 if (uSrc2.au64[0] <= 63)
9357 {
9358 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
9359 }
9360 else
9361 {
9362 uDst.au64[0] = 0;
9363 }
9364 *puDst = uDst.u;
9365}
9366
9367
9368IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
9369{
9370 RTUINT64U uSrc1 = { *puDst };
9371 RTUINT64U uDst;
9372
9373 if (uShift <= 63)
9374 {
9375 uDst.au64[0] = uSrc1.au64[0] << uShift;
9376 }
9377 else
9378 {
9379 uDst.au64[0] = 0;
9380 }
9381 *puDst = uDst.u;
9382}
9383
9384
9385IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9386{
9387 RTUINT128U uSrc1 = *puDst;
9388
9389 if (puSrc->au64[0] <= 63)
9390 {
9391 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
9392 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
9393 }
9394 else
9395 {
9396 puDst->au64[0] = 0;
9397 puDst->au64[1] = 0;
9398 }
9399}
9400
9401IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9402{
9403 RTUINT128U uSrc1 = *puDst;
9404
9405 if (uShift <= 63)
9406 {
9407 puDst->au64[0] = uSrc1.au64[0] << uShift;
9408 puDst->au64[1] = uSrc1.au64[1] << uShift;
9409 }
9410 else
9411 {
9412 puDst->au64[0] = 0;
9413 puDst->au64[1] = 0;
9414 }
9415}
9416
9417#endif
9418
9419
9420/*
9421 * PSRLDQ / VPSRLDQ
9422 */
9423#ifdef IEM_WITHOUT_ASSEMBLY
9424
9425IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9426{
9427 RTUINT128U uSrc1 = *puDst;
9428
9429 if (uShift < 16)
9430 {
9431 int i;
9432
9433 for (i = 0; i < 16 - uShift; ++i)
9434 puDst->au8[i] = uSrc1.au8[i + uShift];
9435 for (i = 16 - uShift; i < 16; ++i)
9436 puDst->au8[i] = 0;
9437 }
9438 else
9439 {
9440 puDst->au64[0] = 0;
9441 puDst->au64[1] = 0;
9442 }
9443}
9444
9445#endif
9446
9447
9448/*
9449 * PSLLDQ / VPSLLDQ
9450 */
9451#ifdef IEM_WITHOUT_ASSEMBLY
9452
9453IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9454{
9455 RTUINT128U uSrc1 = *puDst;
9456
9457 if (uShift < 16)
9458 {
9459 int i;
9460
9461 for (i = 0; i < uShift; ++i)
9462 puDst->au8[i] = 0;
9463 for (i = uShift; i < 16; ++i)
9464 puDst->au8[i] = uSrc1.au8[i - uShift];
9465 }
9466 else
9467 {
9468 puDst->au64[0] = 0;
9469 puDst->au64[1] = 0;
9470 }
9471}
9472
9473#endif
9474
9475
9476/*
9477 * PMADDWD / VPMADDWD
9478 */
9479#ifdef IEM_WITHOUT_ASSEMBLY
9480
9481IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9482{
9483 RTUINT64U uSrc1 = { *puDst };
9484 RTUINT64U uSrc2 = { *puSrc };
9485 RTUINT64U uDst;
9486
9487 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
9488 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
9489 *puDst = uDst.u;
9490 RT_NOREF(pFpuState);
9491}
9492
9493
9494IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9495{
9496 RTUINT128U uSrc1 = *puDst;
9497
9498 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
9499 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
9500 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
9501 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
9502 RT_NOREF(pFpuState);
9503}
9504
9505#endif
9506
9507
9508/*
9509 * PMAXUB / VPMAXUB
9510 */
9511#ifdef IEM_WITHOUT_ASSEMBLY
9512
9513IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9514{
9515 RTUINT64U uSrc1 = { *puDst };
9516 RTUINT64U uSrc2 = { *puSrc };
9517 RTUINT64U uDst;
9518
9519 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
9520 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
9521 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
9522 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
9523 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
9524 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
9525 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
9526 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
9527 *puDst = uDst.u;
9528 RT_NOREF(pFpuState);
9529}
9530
9531
9532IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9533{
9534 RTUINT128U uSrc1 = *puDst;
9535
9536 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
9537 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
9538 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
9539 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
9540 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
9541 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
9542 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
9543 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
9544 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
9545 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
9546 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
9547 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
9548 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
9549 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
9550 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
9551 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
9552 RT_NOREF(pFpuState);
9553}
9554
9555#endif
9556
9557
9558/*
9559 * PMINUB / VPMINUB
9560 */
9561#ifdef IEM_WITHOUT_ASSEMBLY
9562
9563IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9564{
9565 RTUINT64U uSrc1 = { *puDst };
9566 RTUINT64U uSrc2 = { *puSrc };
9567 RTUINT64U uDst;
9568
9569 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
9570 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
9571 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
9572 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
9573 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
9574 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
9575 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
9576 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
9577 *puDst = uDst.u;
9578 RT_NOREF(pFpuState);
9579}
9580
9581
9582IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9583{
9584 RTUINT128U uSrc1 = *puDst;
9585
9586 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
9587 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
9588 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
9589 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
9590 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
9591 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
9592 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
9593 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
9594 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
9595 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
9596 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
9597 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
9598 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
9599 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
9600 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
9601 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
9602 RT_NOREF(pFpuState);
9603}
9604
9605#endif
9606
9607
9608/*
9609 * PMOVMSKB / VPMOVMSKB
9610 */
9611#ifdef IEM_WITHOUT_ASSEMBLY
9612
9613IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
9614{
9615 /* The the most signficant bit from each byte and store them in the given general purpose register. */
9616 uint64_t const uSrc = *pu64Src;
9617 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
9618 | ((uSrc >> (15-1)) & RT_BIT_64(1))
9619 | ((uSrc >> (23-2)) & RT_BIT_64(2))
9620 | ((uSrc >> (31-3)) & RT_BIT_64(3))
9621 | ((uSrc >> (39-4)) & RT_BIT_64(4))
9622 | ((uSrc >> (47-5)) & RT_BIT_64(5))
9623 | ((uSrc >> (55-6)) & RT_BIT_64(6))
9624 | ((uSrc >> (63-7)) & RT_BIT_64(7));
9625}
9626
9627
9628IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
9629{
9630 /* The the most signficant bit from each byte and store them in the given general purpose register. */
9631 uint64_t const uSrc0 = pu128Src->QWords.qw0;
9632 uint64_t const uSrc1 = pu128Src->QWords.qw1;
9633 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
9634 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
9635 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
9636 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
9637 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
9638 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
9639 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
9640 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
9641 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
9642 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
9643 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
9644 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
9645 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
9646 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
9647 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
9648 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
9649}
9650
9651#endif
9652
9653IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
9654{
9655 /* The the most signficant bit from each byte and store them in the given general purpose register. */
9656 uint64_t const uSrc0 = puSrc->QWords.qw0;
9657 uint64_t const uSrc1 = puSrc->QWords.qw1;
9658 uint64_t const uSrc2 = puSrc->QWords.qw2;
9659 uint64_t const uSrc3 = puSrc->QWords.qw3;
9660 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
9661 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
9662 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
9663 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
9664 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
9665 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
9666 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
9667 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
9668 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
9669 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
9670 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
9671 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
9672 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
9673 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
9674 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
9675 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
9676 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
9677 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
9678 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
9679 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
9680 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
9681 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
9682 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
9683 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
9684 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
9685 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
9686 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
9687 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
9688 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
9689 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
9690 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
9691 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
9692}
9693
9694
9695/*
9696 * [V]PSHUFB
9697 */
9698
9699IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9700{
9701 RTUINT64U const uSrc = { *puSrc };
9702 RTUINT64U const uDstIn = { *puDst };
9703 ASMCompilerBarrier();
9704 RTUINT64U uDstOut = { 0 };
9705 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
9706 {
9707 uint8_t idxSrc = uSrc.au8[iByte];
9708 if (!(idxSrc & 0x80))
9709 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
9710 }
9711 *puDst = uDstOut.u;
9712 RT_NOREF(pFpuState);
9713}
9714
9715
9716IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9717{
9718 RTUINT128U const uSrc = *puSrc;
9719 RTUINT128U const uDstIn = *puDst;
9720 ASMCompilerBarrier();
9721 puDst->au64[0] = 0;
9722 puDst->au64[1] = 0;
9723 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
9724 {
9725 uint8_t idxSrc = uSrc.au8[iByte];
9726 if (!(idxSrc & 0x80))
9727 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
9728 }
9729 RT_NOREF(pFpuState);
9730}
9731
9732
9733IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9734 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9735{
9736 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
9737 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
9738 ASMCompilerBarrier();
9739 puDst->au64[0] = 0;
9740 puDst->au64[1] = 0;
9741 for (unsigned iByte = 0; iByte < 16; iByte++)
9742 {
9743 uint8_t idxSrc = uSrc2.au8[iByte];
9744 if (!(idxSrc & 0x80))
9745 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
9746 }
9747 RT_NOREF(pExtState);
9748}
9749
9750
9751IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9752 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9753{
9754 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
9755 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
9756 ASMCompilerBarrier();
9757 puDst->au64[0] = 0;
9758 puDst->au64[1] = 0;
9759 puDst->au64[2] = 0;
9760 puDst->au64[3] = 0;
9761 for (unsigned iByte = 0; iByte < 16; iByte++)
9762 {
9763 uint8_t idxSrc = uSrc2.au8[iByte];
9764 if (!(idxSrc & 0x80))
9765 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
9766 }
9767 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
9768 {
9769 uint8_t idxSrc = uSrc2.au8[iByte];
9770 if (!(idxSrc & 0x80))
9771 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
9772 }
9773 RT_NOREF(pExtState);
9774}
9775
9776
9777/*
9778 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
9779 */
9780#ifdef IEM_WITHOUT_ASSEMBLY
9781
9782IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
9783{
9784 uint64_t const uSrc = *puSrc;
9785 ASMCompilerBarrier();
9786 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
9787 uSrc >> (((bEvil >> 2) & 3) * 16),
9788 uSrc >> (((bEvil >> 4) & 3) * 16),
9789 uSrc >> (((bEvil >> 6) & 3) * 16));
9790}
9791
9792
9793IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
9794{
9795 puDst->QWords.qw0 = puSrc->QWords.qw0;
9796 uint64_t const uSrc = puSrc->QWords.qw1;
9797 ASMCompilerBarrier();
9798 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
9799 uSrc >> (((bEvil >> 2) & 3) * 16),
9800 uSrc >> (((bEvil >> 4) & 3) * 16),
9801 uSrc >> (((bEvil >> 6) & 3) * 16));
9802}
9803
9804#endif
9805
9806IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
9807{
9808 puDst->QWords.qw0 = puSrc->QWords.qw0;
9809 uint64_t const uSrc1 = puSrc->QWords.qw1;
9810 puDst->QWords.qw2 = puSrc->QWords.qw2;
9811 uint64_t const uSrc3 = puSrc->QWords.qw3;
9812 ASMCompilerBarrier();
9813 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
9814 uSrc1 >> (((bEvil >> 2) & 3) * 16),
9815 uSrc1 >> (((bEvil >> 4) & 3) * 16),
9816 uSrc1 >> (((bEvil >> 6) & 3) * 16));
9817 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
9818 uSrc3 >> (((bEvil >> 2) & 3) * 16),
9819 uSrc3 >> (((bEvil >> 4) & 3) * 16),
9820 uSrc3 >> (((bEvil >> 6) & 3) * 16));
9821}
9822
9823#ifdef IEM_WITHOUT_ASSEMBLY
9824IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
9825{
9826 puDst->QWords.qw1 = puSrc->QWords.qw1;
9827 uint64_t const uSrc = puSrc->QWords.qw0;
9828 ASMCompilerBarrier();
9829 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
9830 uSrc >> (((bEvil >> 2) & 3) * 16),
9831 uSrc >> (((bEvil >> 4) & 3) * 16),
9832 uSrc >> (((bEvil >> 6) & 3) * 16));
9833
9834}
9835#endif
9836
9837
9838IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
9839{
9840 puDst->QWords.qw3 = puSrc->QWords.qw3;
9841 uint64_t const uSrc2 = puSrc->QWords.qw2;
9842 puDst->QWords.qw1 = puSrc->QWords.qw1;
9843 uint64_t const uSrc0 = puSrc->QWords.qw0;
9844 ASMCompilerBarrier();
9845 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
9846 uSrc0 >> (((bEvil >> 2) & 3) * 16),
9847 uSrc0 >> (((bEvil >> 4) & 3) * 16),
9848 uSrc0 >> (((bEvil >> 6) & 3) * 16));
9849 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
9850 uSrc2 >> (((bEvil >> 2) & 3) * 16),
9851 uSrc2 >> (((bEvil >> 4) & 3) * 16),
9852 uSrc2 >> (((bEvil >> 6) & 3) * 16));
9853
9854}
9855
9856
9857#ifdef IEM_WITHOUT_ASSEMBLY
9858IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
9859{
9860 RTUINT128U const uSrc = *puSrc;
9861 ASMCompilerBarrier();
9862 puDst->au32[0] = uSrc.au32[bEvil & 3];
9863 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
9864 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
9865 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
9866}
9867#endif
9868
9869
9870IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
9871{
9872 RTUINT256U const uSrc = *puSrc;
9873 ASMCompilerBarrier();
9874 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
9875 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
9876 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
9877 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
9878 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
9879 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
9880 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
9881 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
9882}
9883
9884
9885/*
9886 * PUNPCKHBW - high bytes -> words
9887 */
9888#ifdef IEM_WITHOUT_ASSEMBLY
9889
9890IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9891{
9892 RTUINT64U const uSrc2 = { *puSrc };
9893 RTUINT64U const uSrc1 = { *puDst };
9894 ASMCompilerBarrier();
9895 RTUINT64U uDstOut;
9896 uDstOut.au8[0] = uSrc1.au8[4];
9897 uDstOut.au8[1] = uSrc2.au8[4];
9898 uDstOut.au8[2] = uSrc1.au8[5];
9899 uDstOut.au8[3] = uSrc2.au8[5];
9900 uDstOut.au8[4] = uSrc1.au8[6];
9901 uDstOut.au8[5] = uSrc2.au8[6];
9902 uDstOut.au8[6] = uSrc1.au8[7];
9903 uDstOut.au8[7] = uSrc2.au8[7];
9904 *puDst = uDstOut.u;
9905}
9906
9907
9908IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9909{
9910 RTUINT128U const uSrc2 = *puSrc;
9911 RTUINT128U const uSrc1 = *puDst;
9912 ASMCompilerBarrier();
9913 RTUINT128U uDstOut;
9914 uDstOut.au8[ 0] = uSrc1.au8[ 8];
9915 uDstOut.au8[ 1] = uSrc2.au8[ 8];
9916 uDstOut.au8[ 2] = uSrc1.au8[ 9];
9917 uDstOut.au8[ 3] = uSrc2.au8[ 9];
9918 uDstOut.au8[ 4] = uSrc1.au8[10];
9919 uDstOut.au8[ 5] = uSrc2.au8[10];
9920 uDstOut.au8[ 6] = uSrc1.au8[11];
9921 uDstOut.au8[ 7] = uSrc2.au8[11];
9922 uDstOut.au8[ 8] = uSrc1.au8[12];
9923 uDstOut.au8[ 9] = uSrc2.au8[12];
9924 uDstOut.au8[10] = uSrc1.au8[13];
9925 uDstOut.au8[11] = uSrc2.au8[13];
9926 uDstOut.au8[12] = uSrc1.au8[14];
9927 uDstOut.au8[13] = uSrc2.au8[14];
9928 uDstOut.au8[14] = uSrc1.au8[15];
9929 uDstOut.au8[15] = uSrc2.au8[15];
9930 *puDst = uDstOut;
9931}
9932
9933#endif
9934
9935IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9936{
9937 RTUINT128U const uSrc2 = *puSrc2;
9938 RTUINT128U const uSrc1 = *puSrc1;
9939 ASMCompilerBarrier();
9940 RTUINT128U uDstOut;
9941 uDstOut.au8[ 0] = uSrc1.au8[ 8];
9942 uDstOut.au8[ 1] = uSrc2.au8[ 8];
9943 uDstOut.au8[ 2] = uSrc1.au8[ 9];
9944 uDstOut.au8[ 3] = uSrc2.au8[ 9];
9945 uDstOut.au8[ 4] = uSrc1.au8[10];
9946 uDstOut.au8[ 5] = uSrc2.au8[10];
9947 uDstOut.au8[ 6] = uSrc1.au8[11];
9948 uDstOut.au8[ 7] = uSrc2.au8[11];
9949 uDstOut.au8[ 8] = uSrc1.au8[12];
9950 uDstOut.au8[ 9] = uSrc2.au8[12];
9951 uDstOut.au8[10] = uSrc1.au8[13];
9952 uDstOut.au8[11] = uSrc2.au8[13];
9953 uDstOut.au8[12] = uSrc1.au8[14];
9954 uDstOut.au8[13] = uSrc2.au8[14];
9955 uDstOut.au8[14] = uSrc1.au8[15];
9956 uDstOut.au8[15] = uSrc2.au8[15];
9957 *puDst = uDstOut;
9958}
9959
9960
9961IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9962{
9963 RTUINT256U const uSrc2 = *puSrc2;
9964 RTUINT256U const uSrc1 = *puSrc1;
9965 ASMCompilerBarrier();
9966 RTUINT256U uDstOut;
9967 uDstOut.au8[ 0] = uSrc1.au8[ 8];
9968 uDstOut.au8[ 1] = uSrc2.au8[ 8];
9969 uDstOut.au8[ 2] = uSrc1.au8[ 9];
9970 uDstOut.au8[ 3] = uSrc2.au8[ 9];
9971 uDstOut.au8[ 4] = uSrc1.au8[10];
9972 uDstOut.au8[ 5] = uSrc2.au8[10];
9973 uDstOut.au8[ 6] = uSrc1.au8[11];
9974 uDstOut.au8[ 7] = uSrc2.au8[11];
9975 uDstOut.au8[ 8] = uSrc1.au8[12];
9976 uDstOut.au8[ 9] = uSrc2.au8[12];
9977 uDstOut.au8[10] = uSrc1.au8[13];
9978 uDstOut.au8[11] = uSrc2.au8[13];
9979 uDstOut.au8[12] = uSrc1.au8[14];
9980 uDstOut.au8[13] = uSrc2.au8[14];
9981 uDstOut.au8[14] = uSrc1.au8[15];
9982 uDstOut.au8[15] = uSrc2.au8[15];
9983 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
9984 uDstOut.au8[16] = uSrc1.au8[24];
9985 uDstOut.au8[17] = uSrc2.au8[24];
9986 uDstOut.au8[18] = uSrc1.au8[25];
9987 uDstOut.au8[19] = uSrc2.au8[25];
9988 uDstOut.au8[20] = uSrc1.au8[26];
9989 uDstOut.au8[21] = uSrc2.au8[26];
9990 uDstOut.au8[22] = uSrc1.au8[27];
9991 uDstOut.au8[23] = uSrc2.au8[27];
9992 uDstOut.au8[24] = uSrc1.au8[28];
9993 uDstOut.au8[25] = uSrc2.au8[28];
9994 uDstOut.au8[26] = uSrc1.au8[29];
9995 uDstOut.au8[27] = uSrc2.au8[29];
9996 uDstOut.au8[28] = uSrc1.au8[30];
9997 uDstOut.au8[29] = uSrc2.au8[30];
9998 uDstOut.au8[30] = uSrc1.au8[31];
9999 uDstOut.au8[31] = uSrc2.au8[31];
10000 *puDst = uDstOut;
10001}
10002
10003
10004/*
 * PUNPCKHWD - high words -> dwords
10006 */
10007#ifdef IEM_WITHOUT_ASSEMBLY
10008
10009IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
10010{
10011 RTUINT64U const uSrc2 = { *puSrc };
10012 RTUINT64U const uSrc1 = { *puDst };
10013 ASMCompilerBarrier();
10014 RTUINT64U uDstOut;
10015 uDstOut.au16[0] = uSrc1.au16[2];
10016 uDstOut.au16[1] = uSrc2.au16[2];
10017 uDstOut.au16[2] = uSrc1.au16[3];
10018 uDstOut.au16[3] = uSrc2.au16[3];
10019 *puDst = uDstOut.u;
10020}
10021
10022
10023IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10024{
10025 RTUINT128U const uSrc2 = *puSrc;
10026 RTUINT128U const uSrc1 = *puDst;
10027 ASMCompilerBarrier();
10028 RTUINT128U uDstOut;
10029 uDstOut.au16[0] = uSrc1.au16[4];
10030 uDstOut.au16[1] = uSrc2.au16[4];
10031 uDstOut.au16[2] = uSrc1.au16[5];
10032 uDstOut.au16[3] = uSrc2.au16[5];
10033 uDstOut.au16[4] = uSrc1.au16[6];
10034 uDstOut.au16[5] = uSrc2.au16[6];
10035 uDstOut.au16[6] = uSrc1.au16[7];
10036 uDstOut.au16[7] = uSrc2.au16[7];
10037 *puDst = uDstOut;
10038}
10039
10040#endif
10041
10042IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10043{
10044 RTUINT128U const uSrc2 = *puSrc2;
10045 RTUINT128U const uSrc1 = *puSrc1;
10046 ASMCompilerBarrier();
10047 RTUINT128U uDstOut;
10048 uDstOut.au16[0] = uSrc1.au16[4];
10049 uDstOut.au16[1] = uSrc2.au16[4];
10050 uDstOut.au16[2] = uSrc1.au16[5];
10051 uDstOut.au16[3] = uSrc2.au16[5];
10052 uDstOut.au16[4] = uSrc1.au16[6];
10053 uDstOut.au16[5] = uSrc2.au16[6];
10054 uDstOut.au16[6] = uSrc1.au16[7];
10055 uDstOut.au16[7] = uSrc2.au16[7];
10056 *puDst = uDstOut;
10057}
10058
10059
10060IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10061{
10062 RTUINT256U const uSrc2 = *puSrc2;
10063 RTUINT256U const uSrc1 = *puSrc1;
10064 ASMCompilerBarrier();
10065 RTUINT256U uDstOut;
10066 uDstOut.au16[0] = uSrc1.au16[4];
10067 uDstOut.au16[1] = uSrc2.au16[4];
10068 uDstOut.au16[2] = uSrc1.au16[5];
10069 uDstOut.au16[3] = uSrc2.au16[5];
10070 uDstOut.au16[4] = uSrc1.au16[6];
10071 uDstOut.au16[5] = uSrc2.au16[6];
10072 uDstOut.au16[6] = uSrc1.au16[7];
10073 uDstOut.au16[7] = uSrc2.au16[7];
10074
10075 uDstOut.au16[8] = uSrc1.au16[12];
10076 uDstOut.au16[9] = uSrc2.au16[12];
10077 uDstOut.au16[10] = uSrc1.au16[13];
10078 uDstOut.au16[11] = uSrc2.au16[13];
10079 uDstOut.au16[12] = uSrc1.au16[14];
10080 uDstOut.au16[13] = uSrc2.au16[14];
10081 uDstOut.au16[14] = uSrc1.au16[15];
10082 uDstOut.au16[15] = uSrc2.au16[15];
10083 *puDst = uDstOut;
10084}
10085
10086
10087/*
 * PUNPCKHDQ - high dwords -> qword(s)
10089 */
10090#ifdef IEM_WITHOUT_ASSEMBLY
10091
10092IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10093{
10094 RTUINT64U const uSrc2 = { *puSrc };
10095 RTUINT64U const uSrc1 = { *puDst };
10096 ASMCompilerBarrier();
10097 RTUINT64U uDstOut;
10098 uDstOut.au32[0] = uSrc1.au32[1];
10099 uDstOut.au32[1] = uSrc2.au32[1];
10100 *puDst = uDstOut.u;
10101}
10102
10103
10104IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10105{
10106 RTUINT128U const uSrc2 = *puSrc;
10107 RTUINT128U const uSrc1 = *puDst;
10108 ASMCompilerBarrier();
10109 RTUINT128U uDstOut;
10110 uDstOut.au32[0] = uSrc1.au32[2];
10111 uDstOut.au32[1] = uSrc2.au32[2];
10112 uDstOut.au32[2] = uSrc1.au32[3];
10113 uDstOut.au32[3] = uSrc2.au32[3];
10114 *puDst = uDstOut;
10115}
10116
10117#endif
10118
10119IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10120{
10121 RTUINT128U const uSrc2 = *puSrc2;
10122 RTUINT128U const uSrc1 = *puSrc1;
10123 ASMCompilerBarrier();
10124 RTUINT128U uDstOut;
10125 uDstOut.au32[0] = uSrc1.au32[2];
10126 uDstOut.au32[1] = uSrc2.au32[2];
10127 uDstOut.au32[2] = uSrc1.au32[3];
10128 uDstOut.au32[3] = uSrc2.au32[3];
10129 *puDst = uDstOut;
10130}
10131
10132
10133IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10134{
10135 RTUINT256U const uSrc2 = *puSrc2;
10136 RTUINT256U const uSrc1 = *puSrc1;
10137 ASMCompilerBarrier();
10138 RTUINT256U uDstOut;
10139 uDstOut.au32[0] = uSrc1.au32[2];
10140 uDstOut.au32[1] = uSrc2.au32[2];
10141 uDstOut.au32[2] = uSrc1.au32[3];
10142 uDstOut.au32[3] = uSrc2.au32[3];
10143
10144 uDstOut.au32[4] = uSrc1.au32[6];
10145 uDstOut.au32[5] = uSrc2.au32[6];
10146 uDstOut.au32[6] = uSrc1.au32[7];
10147 uDstOut.au32[7] = uSrc2.au32[7];
10148 *puDst = uDstOut;
10149}
10150
10151
10152/*
10153 * PUNPCKHQDQ -> High qwords -> double qword(s).
10154 */
10155#ifdef IEM_WITHOUT_ASSEMBLY
10156IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10157{
10158 RTUINT128U const uSrc2 = *puSrc;
10159 RTUINT128U const uSrc1 = *puDst;
10160 ASMCompilerBarrier();
10161 RTUINT128U uDstOut;
10162 uDstOut.au64[0] = uSrc1.au64[1];
10163 uDstOut.au64[1] = uSrc2.au64[1];
10164 *puDst = uDstOut;
10165}
10166#endif
10167
10168
10169IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10170{
10171 RTUINT128U const uSrc2 = *puSrc2;
10172 RTUINT128U const uSrc1 = *puSrc1;
10173 ASMCompilerBarrier();
10174 RTUINT128U uDstOut;
10175 uDstOut.au64[0] = uSrc1.au64[1];
10176 uDstOut.au64[1] = uSrc2.au64[1];
10177 *puDst = uDstOut;
10178}
10179
10180
10181IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10182{
10183 RTUINT256U const uSrc2 = *puSrc2;
10184 RTUINT256U const uSrc1 = *puSrc1;
10185 ASMCompilerBarrier();
10186 RTUINT256U uDstOut;
10187 uDstOut.au64[0] = uSrc1.au64[1];
10188 uDstOut.au64[1] = uSrc2.au64[1];
10189
10190 uDstOut.au64[2] = uSrc1.au64[3];
10191 uDstOut.au64[3] = uSrc2.au64[3];
10192 *puDst = uDstOut;
10193}
10194
10195
10196/*
10197 * PUNPCKLBW - low bytes -> words
10198 */
10199#ifdef IEM_WITHOUT_ASSEMBLY
10200
10201IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10202{
10203 RTUINT64U const uSrc2 = { *puSrc };
10204 RTUINT64U const uSrc1 = { *puDst };
10205 ASMCompilerBarrier();
10206 RTUINT64U uDstOut;
10207 uDstOut.au8[0] = uSrc1.au8[0];
10208 uDstOut.au8[1] = uSrc2.au8[0];
10209 uDstOut.au8[2] = uSrc1.au8[1];
10210 uDstOut.au8[3] = uSrc2.au8[1];
10211 uDstOut.au8[4] = uSrc1.au8[2];
10212 uDstOut.au8[5] = uSrc2.au8[2];
10213 uDstOut.au8[6] = uSrc1.au8[3];
10214 uDstOut.au8[7] = uSrc2.au8[3];
10215 *puDst = uDstOut.u;
10216}
10217
10218
10219IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10220{
10221 RTUINT128U const uSrc2 = *puSrc;
10222 RTUINT128U const uSrc1 = *puDst;
10223 ASMCompilerBarrier();
10224 RTUINT128U uDstOut;
10225 uDstOut.au8[ 0] = uSrc1.au8[0];
10226 uDstOut.au8[ 1] = uSrc2.au8[0];
10227 uDstOut.au8[ 2] = uSrc1.au8[1];
10228 uDstOut.au8[ 3] = uSrc2.au8[1];
10229 uDstOut.au8[ 4] = uSrc1.au8[2];
10230 uDstOut.au8[ 5] = uSrc2.au8[2];
10231 uDstOut.au8[ 6] = uSrc1.au8[3];
10232 uDstOut.au8[ 7] = uSrc2.au8[3];
10233 uDstOut.au8[ 8] = uSrc1.au8[4];
10234 uDstOut.au8[ 9] = uSrc2.au8[4];
10235 uDstOut.au8[10] = uSrc1.au8[5];
10236 uDstOut.au8[11] = uSrc2.au8[5];
10237 uDstOut.au8[12] = uSrc1.au8[6];
10238 uDstOut.au8[13] = uSrc2.au8[6];
10239 uDstOut.au8[14] = uSrc1.au8[7];
10240 uDstOut.au8[15] = uSrc2.au8[7];
10241 *puDst = uDstOut;
10242}
10243
10244#endif
10245
10246IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10247{
10248 RTUINT128U const uSrc2 = *puSrc2;
10249 RTUINT128U const uSrc1 = *puSrc1;
10250 ASMCompilerBarrier();
10251 RTUINT128U uDstOut;
10252 uDstOut.au8[ 0] = uSrc1.au8[0];
10253 uDstOut.au8[ 1] = uSrc2.au8[0];
10254 uDstOut.au8[ 2] = uSrc1.au8[1];
10255 uDstOut.au8[ 3] = uSrc2.au8[1];
10256 uDstOut.au8[ 4] = uSrc1.au8[2];
10257 uDstOut.au8[ 5] = uSrc2.au8[2];
10258 uDstOut.au8[ 6] = uSrc1.au8[3];
10259 uDstOut.au8[ 7] = uSrc2.au8[3];
10260 uDstOut.au8[ 8] = uSrc1.au8[4];
10261 uDstOut.au8[ 9] = uSrc2.au8[4];
10262 uDstOut.au8[10] = uSrc1.au8[5];
10263 uDstOut.au8[11] = uSrc2.au8[5];
10264 uDstOut.au8[12] = uSrc1.au8[6];
10265 uDstOut.au8[13] = uSrc2.au8[6];
10266 uDstOut.au8[14] = uSrc1.au8[7];
10267 uDstOut.au8[15] = uSrc2.au8[7];
10268 *puDst = uDstOut;
10269}
10270
10271
10272IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10273{
10274 RTUINT256U const uSrc2 = *puSrc2;
10275 RTUINT256U const uSrc1 = *puSrc1;
10276 ASMCompilerBarrier();
10277 RTUINT256U uDstOut;
10278 uDstOut.au8[ 0] = uSrc1.au8[0];
10279 uDstOut.au8[ 1] = uSrc2.au8[0];
10280 uDstOut.au8[ 2] = uSrc1.au8[1];
10281 uDstOut.au8[ 3] = uSrc2.au8[1];
10282 uDstOut.au8[ 4] = uSrc1.au8[2];
10283 uDstOut.au8[ 5] = uSrc2.au8[2];
10284 uDstOut.au8[ 6] = uSrc1.au8[3];
10285 uDstOut.au8[ 7] = uSrc2.au8[3];
10286 uDstOut.au8[ 8] = uSrc1.au8[4];
10287 uDstOut.au8[ 9] = uSrc2.au8[4];
10288 uDstOut.au8[10] = uSrc1.au8[5];
10289 uDstOut.au8[11] = uSrc2.au8[5];
10290 uDstOut.au8[12] = uSrc1.au8[6];
10291 uDstOut.au8[13] = uSrc2.au8[6];
10292 uDstOut.au8[14] = uSrc1.au8[7];
10293 uDstOut.au8[15] = uSrc2.au8[7];
10294 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
10295 uDstOut.au8[16] = uSrc1.au8[16];
10296 uDstOut.au8[17] = uSrc2.au8[16];
10297 uDstOut.au8[18] = uSrc1.au8[17];
10298 uDstOut.au8[19] = uSrc2.au8[17];
10299 uDstOut.au8[20] = uSrc1.au8[18];
10300 uDstOut.au8[21] = uSrc2.au8[18];
10301 uDstOut.au8[22] = uSrc1.au8[19];
10302 uDstOut.au8[23] = uSrc2.au8[19];
10303 uDstOut.au8[24] = uSrc1.au8[20];
10304 uDstOut.au8[25] = uSrc2.au8[20];
10305 uDstOut.au8[26] = uSrc1.au8[21];
10306 uDstOut.au8[27] = uSrc2.au8[21];
10307 uDstOut.au8[28] = uSrc1.au8[22];
10308 uDstOut.au8[29] = uSrc2.au8[22];
10309 uDstOut.au8[30] = uSrc1.au8[23];
10310 uDstOut.au8[31] = uSrc2.au8[23];
10311 *puDst = uDstOut;
10312}
10313
10314
10315/*
 * PUNPCKLWD - low words -> dwords
10317 */
10318#ifdef IEM_WITHOUT_ASSEMBLY
10319
10320IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
10321{
10322 RTUINT64U const uSrc2 = { *puSrc };
10323 RTUINT64U const uSrc1 = { *puDst };
10324 ASMCompilerBarrier();
10325 RTUINT64U uDstOut;
10326 uDstOut.au16[0] = uSrc1.au16[0];
10327 uDstOut.au16[1] = uSrc2.au16[0];
10328 uDstOut.au16[2] = uSrc1.au16[1];
10329 uDstOut.au16[3] = uSrc2.au16[1];
10330 *puDst = uDstOut.u;
10331}
10332
10333
10334IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10335{
10336 RTUINT128U const uSrc2 = *puSrc;
10337 RTUINT128U const uSrc1 = *puDst;
10338 ASMCompilerBarrier();
10339 RTUINT128U uDstOut;
10340 uDstOut.au16[0] = uSrc1.au16[0];
10341 uDstOut.au16[1] = uSrc2.au16[0];
10342 uDstOut.au16[2] = uSrc1.au16[1];
10343 uDstOut.au16[3] = uSrc2.au16[1];
10344 uDstOut.au16[4] = uSrc1.au16[2];
10345 uDstOut.au16[5] = uSrc2.au16[2];
10346 uDstOut.au16[6] = uSrc1.au16[3];
10347 uDstOut.au16[7] = uSrc2.au16[3];
10348 *puDst = uDstOut;
10349}
10350
10351#endif
10352
10353IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10354{
10355 RTUINT128U const uSrc2 = *puSrc2;
10356 RTUINT128U const uSrc1 = *puSrc1;
10357 ASMCompilerBarrier();
10358 RTUINT128U uDstOut;
10359 uDstOut.au16[0] = uSrc1.au16[0];
10360 uDstOut.au16[1] = uSrc2.au16[0];
10361 uDstOut.au16[2] = uSrc1.au16[1];
10362 uDstOut.au16[3] = uSrc2.au16[1];
10363 uDstOut.au16[4] = uSrc1.au16[2];
10364 uDstOut.au16[5] = uSrc2.au16[2];
10365 uDstOut.au16[6] = uSrc1.au16[3];
10366 uDstOut.au16[7] = uSrc2.au16[3];
10367 *puDst = uDstOut;
10368}
10369
10370
10371IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10372{
10373 RTUINT256U const uSrc2 = *puSrc2;
10374 RTUINT256U const uSrc1 = *puSrc1;
10375 ASMCompilerBarrier();
10376 RTUINT256U uDstOut;
10377 uDstOut.au16[0] = uSrc1.au16[0];
10378 uDstOut.au16[1] = uSrc2.au16[0];
10379 uDstOut.au16[2] = uSrc1.au16[1];
10380 uDstOut.au16[3] = uSrc2.au16[1];
10381 uDstOut.au16[4] = uSrc1.au16[2];
10382 uDstOut.au16[5] = uSrc2.au16[2];
10383 uDstOut.au16[6] = uSrc1.au16[3];
10384 uDstOut.au16[7] = uSrc2.au16[3];
10385
10386 uDstOut.au16[8] = uSrc1.au16[8];
10387 uDstOut.au16[9] = uSrc2.au16[8];
10388 uDstOut.au16[10] = uSrc1.au16[9];
10389 uDstOut.au16[11] = uSrc2.au16[9];
10390 uDstOut.au16[12] = uSrc1.au16[10];
10391 uDstOut.au16[13] = uSrc2.au16[10];
10392 uDstOut.au16[14] = uSrc1.au16[11];
10393 uDstOut.au16[15] = uSrc2.au16[11];
10394 *puDst = uDstOut;
10395}
10396
10397
10398/*
10399 * PUNPCKLDQ - low dwords -> qword(s)
10400 */
10401#ifdef IEM_WITHOUT_ASSEMBLY
10402
10403IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10404{
10405 RTUINT64U const uSrc2 = { *puSrc };
10406 RTUINT64U const uSrc1 = { *puDst };
10407 ASMCompilerBarrier();
10408 RTUINT64U uDstOut;
10409 uDstOut.au32[0] = uSrc1.au32[0];
10410 uDstOut.au32[1] = uSrc2.au32[0];
10411 *puDst = uDstOut.u;
10412}
10413
10414
10415IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10416{
10417 RTUINT128U const uSrc2 = *puSrc;
10418 RTUINT128U const uSrc1 = *puDst;
10419 ASMCompilerBarrier();
10420 RTUINT128U uDstOut;
10421 uDstOut.au32[0] = uSrc1.au32[0];
10422 uDstOut.au32[1] = uSrc2.au32[0];
10423 uDstOut.au32[2] = uSrc1.au32[1];
10424 uDstOut.au32[3] = uSrc2.au32[1];
10425 *puDst = uDstOut;
10426}
10427
10428#endif
10429
10430IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10431{
10432 RTUINT128U const uSrc2 = *puSrc2;
10433 RTUINT128U const uSrc1 = *puSrc1;
10434 ASMCompilerBarrier();
10435 RTUINT128U uDstOut;
10436 uDstOut.au32[0] = uSrc1.au32[0];
10437 uDstOut.au32[1] = uSrc2.au32[0];
10438 uDstOut.au32[2] = uSrc1.au32[1];
10439 uDstOut.au32[3] = uSrc2.au32[1];
10440 *puDst = uDstOut;
10441}
10442
10443
10444IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10445{
10446 RTUINT256U const uSrc2 = *puSrc2;
10447 RTUINT256U const uSrc1 = *puSrc1;
10448 ASMCompilerBarrier();
10449 RTUINT256U uDstOut;
10450 uDstOut.au32[0] = uSrc1.au32[0];
10451 uDstOut.au32[1] = uSrc2.au32[0];
10452 uDstOut.au32[2] = uSrc1.au32[1];
10453 uDstOut.au32[3] = uSrc2.au32[1];
10454
10455 uDstOut.au32[4] = uSrc1.au32[4];
10456 uDstOut.au32[5] = uSrc2.au32[4];
10457 uDstOut.au32[6] = uSrc1.au32[5];
10458 uDstOut.au32[7] = uSrc2.au32[5];
10459 *puDst = uDstOut;
10460}
10461
10462
10463/*
10464 * PUNPCKLQDQ -> Low qwords -> double qword(s).
10465 */
10466#ifdef IEM_WITHOUT_ASSEMBLY
10467IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10468{
10469 RTUINT128U const uSrc2 = *puSrc;
10470 RTUINT128U const uSrc1 = *puDst;
10471 ASMCompilerBarrier();
10472 RTUINT128U uDstOut;
10473 uDstOut.au64[0] = uSrc1.au64[0];
10474 uDstOut.au64[1] = uSrc2.au64[0];
10475 *puDst = uDstOut;
10476}
10477#endif
10478
10479
10480IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10481{
10482 RTUINT128U const uSrc2 = *puSrc2;
10483 RTUINT128U const uSrc1 = *puSrc1;
10484 ASMCompilerBarrier();
10485 RTUINT128U uDstOut;
10486 uDstOut.au64[0] = uSrc1.au64[0];
10487 uDstOut.au64[1] = uSrc2.au64[0];
10488 *puDst = uDstOut;
10489}
10490
10491
10492IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10493{
10494 RTUINT256U const uSrc2 = *puSrc2;
10495 RTUINT256U const uSrc1 = *puSrc1;
10496 ASMCompilerBarrier();
10497 RTUINT256U uDstOut;
10498 uDstOut.au64[0] = uSrc1.au64[0];
10499 uDstOut.au64[1] = uSrc2.au64[0];
10500
10501 uDstOut.au64[2] = uSrc1.au64[2];
10502 uDstOut.au64[3] = uSrc2.au64[2];
10503 *puDst = uDstOut;
10504}
10505
10506
10507/*
10508 * PACKSSWB - signed words -> signed bytes
10509 */
10510
10511#ifdef IEM_WITHOUT_ASSEMBLY
10512
10513IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
10514{
10515 RTUINT64U const uSrc2 = { *puSrc };
10516 RTUINT64U const uSrc1 = { *puDst };
10517 ASMCompilerBarrier();
10518 RTUINT64U uDstOut;
10519 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
10520 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
10521 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
10522 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
10523 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
10524 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
10525 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
10526 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
10527 *puDst = uDstOut.u;
10528}
10529
10530
10531IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10532{
10533 RTUINT128U const uSrc2 = *puSrc;
10534 RTUINT128U const uSrc1 = *puDst;
10535 ASMCompilerBarrier();
10536 RTUINT128U uDstOut;
10537 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
10538 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
10539 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
10540 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
10541 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
10542 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
10543 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
10544 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
10545 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
10546 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
10547 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
10548 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
10549 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
10550 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
10551 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
10552 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
10553 *puDst = uDstOut;
10554}
10555
10556#endif
10557
10558IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10559{
10560 RTUINT128U const uSrc2 = *puSrc2;
10561 RTUINT128U const uSrc1 = *puSrc1;
10562 ASMCompilerBarrier();
10563 RTUINT128U uDstOut;
10564 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
10565 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
10566 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
10567 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
10568 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
10569 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
10570 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
10571 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
10572 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
10573 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
10574 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
10575 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
10576 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
10577 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
10578 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
10579 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
10580 *puDst = uDstOut;
10581}
10582
10583
10584IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10585{
10586 RTUINT256U const uSrc2 = *puSrc2;
10587 RTUINT256U const uSrc1 = *puSrc1;
10588 ASMCompilerBarrier();
10589 RTUINT256U uDstOut;
10590 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
10591 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
10592 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
10593 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
10594 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
10595 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
10596 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
10597 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
10598 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
10599 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
10600 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
10601 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
10602 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
10603 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
10604 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
10605 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
10606
10607 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
10608 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
10609 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
10610 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
10611 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
10612 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
10613 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
10614 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
10615 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
10616 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
10617 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
10618 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
10619 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
10620 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
10621 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
10622 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
10623 *puDst = uDstOut;
10624}
10625
10626
10627/*
10628 * PACKUSWB - signed words -> unsigned bytes
10629 */
/** @def SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE
 * Converts the bit pattern of a signed 16-bit word to an unsigned byte with
 * saturation: 0..0xff pass through unchanged, a value with bit 15 (the sign)
 * set yields 0x00, and any other value yields 0xff.
 * @note The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    ( ((uint16_t)(a_iWord) >> 8) == 0 \
      ? (uint8_t)(a_iWord) \
      : ( ((uint16_t)(a_iWord) & 0x8000) != 0 ? 0x00 : 0xff ) )
10634
10635#ifdef IEM_WITHOUT_ASSEMBLY
10636
10637IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
10638{
10639 RTUINT64U const uSrc2 = { *puSrc };
10640 RTUINT64U const uSrc1 = { *puDst };
10641 ASMCompilerBarrier();
10642 RTUINT64U uDstOut;
10643 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
10644 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
10645 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
10646 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
10647 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
10648 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
10649 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
10650 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
10651 *puDst = uDstOut.u;
10652}
10653
10654
10655IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10656{
10657 RTUINT128U const uSrc2 = *puSrc;
10658 RTUINT128U const uSrc1 = *puDst;
10659 ASMCompilerBarrier();
10660 RTUINT128U uDstOut;
10661 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
10662 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
10663 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
10664 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
10665 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
10666 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
10667 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
10668 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
10669 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
10670 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
10671 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
10672 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
10673 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
10674 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
10675 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
10676 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
10677 *puDst = uDstOut;
10678}
10679
10680#endif
10681
10682IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10683{
10684 RTUINT128U const uSrc2 = *puSrc2;
10685 RTUINT128U const uSrc1 = *puSrc1;
10686 ASMCompilerBarrier();
10687 RTUINT128U uDstOut;
10688 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
10689 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
10690 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
10691 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
10692 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
10693 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
10694 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
10695 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
10696 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
10697 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
10698 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
10699 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
10700 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
10701 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
10702 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
10703 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
10704 *puDst = uDstOut;
10705}
10706
10707
10708IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10709{
10710 RTUINT256U const uSrc2 = *puSrc2;
10711 RTUINT256U const uSrc1 = *puSrc1;
10712 ASMCompilerBarrier();
10713 RTUINT256U uDstOut;
10714 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
10715 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
10716 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
10717 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
10718 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
10719 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
10720 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
10721 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
10722 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
10723 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
10724 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
10725 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
10726 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
10727 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
10728 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
10729 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
10730
10731 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
10732 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
10733 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
10734 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
10735 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
10736 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
10737 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
10738 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
10739 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
10740 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
10741 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
10742 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
10743 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
10744 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
10745 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
10746 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
10747 *puDst = uDstOut;
10748}
10749
10750
10751/*
10752 * PACKSSDW - signed dwords -> signed words
10753 */
10754
10755#ifdef IEM_WITHOUT_ASSEMBLY
10756
10757IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10758{
10759 RTUINT64U const uSrc2 = { *puSrc };
10760 RTUINT64U const uSrc1 = { *puDst };
10761 ASMCompilerBarrier();
10762 RTUINT64U uDstOut;
10763 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
10764 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
10765 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
10766 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
10767 *puDst = uDstOut.u;
10768}
10769
10770
10771IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10772{
10773 RTUINT128U const uSrc2 = *puSrc;
10774 RTUINT128U const uSrc1 = *puDst;
10775 ASMCompilerBarrier();
10776 RTUINT128U uDstOut;
10777 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
10778 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
10779 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
10780 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
10781 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
10782 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
10783 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
10784 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
10785 *puDst = uDstOut;
10786}
10787
10788#endif
10789
10790IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10791{
10792 RTUINT128U const uSrc2 = *puSrc2;
10793 RTUINT128U const uSrc1 = *puSrc1;
10794 ASMCompilerBarrier();
10795 RTUINT128U uDstOut;
10796 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
10797 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
10798 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
10799 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
10800 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
10801 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
10802 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
10803 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
10804 *puDst = uDstOut;
10805}
10806
10807
10808IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10809{
10810 RTUINT256U const uSrc2 = *puSrc2;
10811 RTUINT256U const uSrc1 = *puSrc1;
10812 ASMCompilerBarrier();
10813 RTUINT256U uDstOut;
10814 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
10815 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
10816 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
10817 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
10818 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
10819 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
10820 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
10821 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
10822
10823 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
10824 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
10825 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
10826 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
10827 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
10828 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
10829 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
10830 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
10831 *puDst = uDstOut;
10832}
10833
10834
10835/*
10836 * PACKUSDW - signed dwords -> unsigned words
10837 */
/** @def SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD
 * Converts the bit pattern of a signed 32-bit dword to an unsigned word with
 * saturation: 0..0xffff pass through unchanged, a value with bit 31 (the sign)
 * set yields 0x0000, and any other value yields 0xffff.
 * @note The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    ( ((uint32_t)(a_iDword) >> 16) == 0 \
      ? (uint16_t)(a_iDword) \
      : ( ((uint32_t)(a_iDword) & 0x80000000u) != 0 ? 0x0000 : 0xffff ) )
10842
10843#ifdef IEM_WITHOUT_ASSEMBLY
10844IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10845{
10846 RTUINT128U const uSrc2 = *puSrc;
10847 RTUINT128U const uSrc1 = *puDst;
10848 ASMCompilerBarrier();
10849 RTUINT128U uDstOut;
10850 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
10851 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
10852 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
10853 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
10854 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
10855 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
10856 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
10857 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
10858 *puDst = uDstOut;
10859}
10860#endif
10861
10862IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10863{
10864 RTUINT128U const uSrc2 = *puSrc2;
10865 RTUINT128U const uSrc1 = *puSrc1;
10866 ASMCompilerBarrier();
10867 RTUINT128U uDstOut;
10868 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
10869 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
10870 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
10871 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
10872 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
10873 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
10874 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
10875 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
10876 *puDst = uDstOut;
10877}
10878
10879
10880IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10881{
10882 RTUINT256U const uSrc2 = *puSrc2;
10883 RTUINT256U const uSrc1 = *puSrc1;
10884 ASMCompilerBarrier();
10885 RTUINT256U uDstOut;
10886 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
10887 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
10888 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
10889 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
10890 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
10891 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
10892 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
10893 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
10894
10895 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
10896 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
10897 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
10898 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
10899 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
10900 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
10901 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
10902 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
10903 *puDst = uDstOut;
10904}
10905
10906
10907/*
10908 * CRC32 (SSE 4.2).
10909 */
10910
10911IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
10912{
10913 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
10914}
10915
10916
10917IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
10918{
10919 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
10920}
10921
10922IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
10923{
10924 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
10925}
10926
10927IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
10928{
10929 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
10930}
10931
10932
10933/*
10934 * PTEST (SSE 4.1) - special as it only outputs EFLAGS.
10935 */
10936#ifdef IEM_WITHOUT_ASSEMBLY
10937IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
10938{
10939 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
10940 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
10941 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
10942 fEfl |= X86_EFL_ZF;
10943 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
10944 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
10945 fEfl |= X86_EFL_CF;
10946 *pfEFlags = fEfl;
10947}
10948#endif
10949
10950IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
10951{
10952 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
10953 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
10954 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
10955 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
10956 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
10957 fEfl |= X86_EFL_ZF;
10958 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
10959 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
10960 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
10961 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
10962 fEfl |= X86_EFL_CF;
10963 *pfEFlags = fEfl;
10964}
10965
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette