VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 96379

Last change on this file since 96379 was 96379, checked in by vboxsync, 2 years ago

VMM/IEM: Implement cvtss2sd/cvtsd2ss instructions + some streamlining of the instruction helpers and fixes, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 561.8 KB
Line 
1/* $Id: IEMAllAImplC.cpp 96379 2022-08-20 19:23:07Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2022 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18
19/*********************************************************************************************************************************
20* Header Files *
21*********************************************************************************************************************************/
22#include "IEMInternal.h"
23#include <VBox/vmm/vmcc.h>
24#include <iprt/errcore.h>
25#include <iprt/x86.h>
26#include <iprt/uint128.h>
27#include <iprt/uint256.h>
28#include <iprt/crc.h>
29
30RT_C_DECLS_BEGIN
31#include <softfloat.h>
32RT_C_DECLS_END
33
34
35/*********************************************************************************************************************************
36* Defined Constants And Macros *
37*********************************************************************************************************************************/
38/** @def IEM_WITHOUT_ASSEMBLY
39 * Enables all the code in this file.
40 */
/* Default to IEM_WITHOUT_ASSEMBLY on ARM32/ARM64 builds and when doxygen is
   running, unless the build configuration already defined it. */
#if    !defined(IEM_WITHOUT_ASSEMBLY) \
    && (defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING))
# define IEM_WITHOUT_ASSEMBLY
#endif

/* For tstIEMAImplAsm purposes an explicit IEM_WITH_ASSEMBLY wins over IEM_WITHOUT_ASSEMBLY. */
#if defined(IEM_WITH_ASSEMBLY)
# undef IEM_WITHOUT_ASSEMBLY
#endif
50
51/**
52 * Calculates the signed flag value given a result and its bit width.
53 *
54 * The signed flag (SF) is a duplication of the most significant bit in the
55 * result.
56 *
57 * @returns X86_EFL_SF or 0.
58 * @param a_uResult Unsigned result value.
59 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
60 */
/* Shifts the result's top bit (bit a_cBitsWidth - 1) down to the SF position and masks it out. */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( X86_EFL_SF & (uint32_t)((a_uResult) >> ((a_cBitsWidth) - 1 - X86_EFL_SF_BIT)) )
63
64/**
65 * Calculates the zero flag value given a result.
66 *
67 * The zero flag (ZF) indicates whether the result is zero or not.
68 *
69 * @returns X86_EFL_ZF or 0.
70 * @param a_uResult Unsigned result value.
71 */
/* Yields the ZF bit (1 << X86_EFL_ZF_BIT) for a zero result and 0 for anything else. */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (a_uResult) == 0 ? (uint32_t)1 << X86_EFL_ZF_BIT : (uint32_t)0 )
74
75/**
76 * Extracts the OF flag from a OF calculation result.
77 *
78 * These are typically used by concatenating with a bit count. The problem is that
79 * 8-bit values need shifting in the other direction than the others.
80 */
/* The input carries the overflow indicator in its top bit (bit 7, 15, 31 or 63).
   Only the 8-bit variant has to shift left to reach the OF position. */
#define X86_EFL_GET_OF_8(a_uValue)  ( X86_EFL_OF & ((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 7)) )
#define X86_EFL_GET_OF_16(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (15 - X86_EFL_OF_BIT)) )
#define X86_EFL_GET_OF_32(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (31 - X86_EFL_OF_BIT)) )
#define X86_EFL_GET_OF_64(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (63 - X86_EFL_OF_BIT)) )
85
86/**
87 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after arithmetic op.
88 *
89 * @returns Status bits.
90 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
91 * @param a_uResult Unsigned result value.
92 * @param a_uDst The original destination value (for AF calc).
93 * @param a_uSrc The source value (for AF calc).
94 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
95 * @param a_CfExpr Bool expression for the carry flag (CF).
96 * @param a_uSrcOf The a_uSrc value to use for overflow calculation.
97 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        /* Work on a copy with the status bits cleared, then OR in each freshly \
           computed flag before writing the lot back. */ \
        uint32_t fEflArithNew = *(a_pfEFlags) & ~X86_EFL_STATUS_BITS; \
        fEflArithNew |= (a_CfExpr) << X86_EFL_CF_BIT; \
        fEflArithNew |= g_afParity[(a_uResult) & 0xff]; \
        /* AF is taken from the XOR of destination, source and result. */ \
        fEflArithNew |= X86_EFL_AF & ((uint32_t)(a_uDst) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uResult)); \
        fEflArithNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflArithNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* An ADDition overflows when both inputs carry the same sign bit and the \
           result's sign bit differs from it. \
           \
           A SUBtraction can be rewritten as an addition (2 - 1 == 2 + -1), so \
           there the sign bits of the two inputs must differ and the result's \
           sign bit must differ from the first input's.  Callers express that by \
           passing the source with its sign bit flipped as a_uSrcOf. \
           Note! Must xor with the sign bit to convert, not do (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        fEflArithNew |= X86_EFL_GET_OF_ ## a_cBitsWidth(  ((a_uResult) ^ (a_uDst)) \
                                                        & RT_BIT_64(a_cBitsWidth - 1) \
                                                        & (uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf)) ); \
        *(a_pfEFlags) = fEflArithNew; \
    } while (0)
122
123/**
124 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
125 *
126 * CF and OF are defined to be 0 by logical operations. AF on the other hand is
127 * undefined. We do not set AF, as that seems to make the most sense (which
128 * probably makes it the most wrong in real life).
129 *
130 * @returns Status bits.
131 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
132 * @param a_uResult Unsigned result value.
133 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
134 * @param a_fExtra Additional bits to set.
135 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        /* Clear every status bit first; CF, AF and OF then stay clear unless the \
           caller hands them in through a_fExtra. */ \
        uint32_t fEflLogicNew = *(a_pfEFlags) & ~X86_EFL_STATUS_BITS; \
        fEflLogicNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflLogicNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflLogicNew |= g_afParity[(a_uResult) & 0xff]; /* PF looks at the low byte only */ \
        fEflLogicNew |= (a_fExtra); \
        *(a_pfEFlags) = fEflLogicNew; \
    } while (0)
146
147
148/*********************************************************************************************************************************
149* Global Variables *
150*********************************************************************************************************************************/
151/**
152 * Parity calculation table.
153 *
154 * This is also used by iemAllAImpl.asm.
155 *
156 * The generator code:
157 * @code
158 * #include <stdio.h>
159 *
160 * int main()
161 * {
162 * unsigned b;
163 * for (b = 0; b < 256; b++)
164 * {
165 * int cOnes = ( b & 1)
166 * + ((b >> 1) & 1)
167 * + ((b >> 2) & 1)
168 * + ((b >> 3) & 1)
169 * + ((b >> 4) & 1)
170 * + ((b >> 5) & 1)
171 * + ((b >> 6) & 1)
172 * + ((b >> 7) & 1);
173 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
174 * b,
175 * (b >> 7) & 1,
176 * (b >> 6) & 1,
177 * (b >> 5) & 1,
178 * (b >> 4) & 1,
179 * (b >> 3) & 1,
180 * (b >> 2) & 1,
181 * (b >> 1) & 1,
182 * b & 1,
183 * cOnes & 1 ? "0" : "X86_EFL_PF");
184 * }
185 * return 0;
186 * }
187 * @endcode
188 */
189uint8_t const g_afParity[256] =
190{
191 /* 0000 = 00000000b */ X86_EFL_PF,
192 /* 0x01 = 00000001b */ 0,
193 /* 0x02 = 00000010b */ 0,
194 /* 0x03 = 00000011b */ X86_EFL_PF,
195 /* 0x04 = 00000100b */ 0,
196 /* 0x05 = 00000101b */ X86_EFL_PF,
197 /* 0x06 = 00000110b */ X86_EFL_PF,
198 /* 0x07 = 00000111b */ 0,
199 /* 0x08 = 00001000b */ 0,
200 /* 0x09 = 00001001b */ X86_EFL_PF,
201 /* 0x0a = 00001010b */ X86_EFL_PF,
202 /* 0x0b = 00001011b */ 0,
203 /* 0x0c = 00001100b */ X86_EFL_PF,
204 /* 0x0d = 00001101b */ 0,
205 /* 0x0e = 00001110b */ 0,
206 /* 0x0f = 00001111b */ X86_EFL_PF,
207 /* 0x10 = 00010000b */ 0,
208 /* 0x11 = 00010001b */ X86_EFL_PF,
209 /* 0x12 = 00010010b */ X86_EFL_PF,
210 /* 0x13 = 00010011b */ 0,
211 /* 0x14 = 00010100b */ X86_EFL_PF,
212 /* 0x15 = 00010101b */ 0,
213 /* 0x16 = 00010110b */ 0,
214 /* 0x17 = 00010111b */ X86_EFL_PF,
215 /* 0x18 = 00011000b */ X86_EFL_PF,
216 /* 0x19 = 00011001b */ 0,
217 /* 0x1a = 00011010b */ 0,
218 /* 0x1b = 00011011b */ X86_EFL_PF,
219 /* 0x1c = 00011100b */ 0,
220 /* 0x1d = 00011101b */ X86_EFL_PF,
221 /* 0x1e = 00011110b */ X86_EFL_PF,
222 /* 0x1f = 00011111b */ 0,
223 /* 0x20 = 00100000b */ 0,
224 /* 0x21 = 00100001b */ X86_EFL_PF,
225 /* 0x22 = 00100010b */ X86_EFL_PF,
226 /* 0x23 = 00100011b */ 0,
227 /* 0x24 = 00100100b */ X86_EFL_PF,
228 /* 0x25 = 00100101b */ 0,
229 /* 0x26 = 00100110b */ 0,
230 /* 0x27 = 00100111b */ X86_EFL_PF,
231 /* 0x28 = 00101000b */ X86_EFL_PF,
232 /* 0x29 = 00101001b */ 0,
233 /* 0x2a = 00101010b */ 0,
234 /* 0x2b = 00101011b */ X86_EFL_PF,
235 /* 0x2c = 00101100b */ 0,
236 /* 0x2d = 00101101b */ X86_EFL_PF,
237 /* 0x2e = 00101110b */ X86_EFL_PF,
238 /* 0x2f = 00101111b */ 0,
239 /* 0x30 = 00110000b */ X86_EFL_PF,
240 /* 0x31 = 00110001b */ 0,
241 /* 0x32 = 00110010b */ 0,
242 /* 0x33 = 00110011b */ X86_EFL_PF,
243 /* 0x34 = 00110100b */ 0,
244 /* 0x35 = 00110101b */ X86_EFL_PF,
245 /* 0x36 = 00110110b */ X86_EFL_PF,
246 /* 0x37 = 00110111b */ 0,
247 /* 0x38 = 00111000b */ 0,
248 /* 0x39 = 00111001b */ X86_EFL_PF,
249 /* 0x3a = 00111010b */ X86_EFL_PF,
250 /* 0x3b = 00111011b */ 0,
251 /* 0x3c = 00111100b */ X86_EFL_PF,
252 /* 0x3d = 00111101b */ 0,
253 /* 0x3e = 00111110b */ 0,
254 /* 0x3f = 00111111b */ X86_EFL_PF,
255 /* 0x40 = 01000000b */ 0,
256 /* 0x41 = 01000001b */ X86_EFL_PF,
257 /* 0x42 = 01000010b */ X86_EFL_PF,
258 /* 0x43 = 01000011b */ 0,
259 /* 0x44 = 01000100b */ X86_EFL_PF,
260 /* 0x45 = 01000101b */ 0,
261 /* 0x46 = 01000110b */ 0,
262 /* 0x47 = 01000111b */ X86_EFL_PF,
263 /* 0x48 = 01001000b */ X86_EFL_PF,
264 /* 0x49 = 01001001b */ 0,
265 /* 0x4a = 01001010b */ 0,
266 /* 0x4b = 01001011b */ X86_EFL_PF,
267 /* 0x4c = 01001100b */ 0,
268 /* 0x4d = 01001101b */ X86_EFL_PF,
269 /* 0x4e = 01001110b */ X86_EFL_PF,
270 /* 0x4f = 01001111b */ 0,
271 /* 0x50 = 01010000b */ X86_EFL_PF,
272 /* 0x51 = 01010001b */ 0,
273 /* 0x52 = 01010010b */ 0,
274 /* 0x53 = 01010011b */ X86_EFL_PF,
275 /* 0x54 = 01010100b */ 0,
276 /* 0x55 = 01010101b */ X86_EFL_PF,
277 /* 0x56 = 01010110b */ X86_EFL_PF,
278 /* 0x57 = 01010111b */ 0,
279 /* 0x58 = 01011000b */ 0,
280 /* 0x59 = 01011001b */ X86_EFL_PF,
281 /* 0x5a = 01011010b */ X86_EFL_PF,
282 /* 0x5b = 01011011b */ 0,
283 /* 0x5c = 01011100b */ X86_EFL_PF,
284 /* 0x5d = 01011101b */ 0,
285 /* 0x5e = 01011110b */ 0,
286 /* 0x5f = 01011111b */ X86_EFL_PF,
287 /* 0x60 = 01100000b */ X86_EFL_PF,
288 /* 0x61 = 01100001b */ 0,
289 /* 0x62 = 01100010b */ 0,
290 /* 0x63 = 01100011b */ X86_EFL_PF,
291 /* 0x64 = 01100100b */ 0,
292 /* 0x65 = 01100101b */ X86_EFL_PF,
293 /* 0x66 = 01100110b */ X86_EFL_PF,
294 /* 0x67 = 01100111b */ 0,
295 /* 0x68 = 01101000b */ 0,
296 /* 0x69 = 01101001b */ X86_EFL_PF,
297 /* 0x6a = 01101010b */ X86_EFL_PF,
298 /* 0x6b = 01101011b */ 0,
299 /* 0x6c = 01101100b */ X86_EFL_PF,
300 /* 0x6d = 01101101b */ 0,
301 /* 0x6e = 01101110b */ 0,
302 /* 0x6f = 01101111b */ X86_EFL_PF,
303 /* 0x70 = 01110000b */ 0,
304 /* 0x71 = 01110001b */ X86_EFL_PF,
305 /* 0x72 = 01110010b */ X86_EFL_PF,
306 /* 0x73 = 01110011b */ 0,
307 /* 0x74 = 01110100b */ X86_EFL_PF,
308 /* 0x75 = 01110101b */ 0,
309 /* 0x76 = 01110110b */ 0,
310 /* 0x77 = 01110111b */ X86_EFL_PF,
311 /* 0x78 = 01111000b */ X86_EFL_PF,
312 /* 0x79 = 01111001b */ 0,
313 /* 0x7a = 01111010b */ 0,
314 /* 0x7b = 01111011b */ X86_EFL_PF,
315 /* 0x7c = 01111100b */ 0,
316 /* 0x7d = 01111101b */ X86_EFL_PF,
317 /* 0x7e = 01111110b */ X86_EFL_PF,
318 /* 0x7f = 01111111b */ 0,
319 /* 0x80 = 10000000b */ 0,
320 /* 0x81 = 10000001b */ X86_EFL_PF,
321 /* 0x82 = 10000010b */ X86_EFL_PF,
322 /* 0x83 = 10000011b */ 0,
323 /* 0x84 = 10000100b */ X86_EFL_PF,
324 /* 0x85 = 10000101b */ 0,
325 /* 0x86 = 10000110b */ 0,
326 /* 0x87 = 10000111b */ X86_EFL_PF,
327 /* 0x88 = 10001000b */ X86_EFL_PF,
328 /* 0x89 = 10001001b */ 0,
329 /* 0x8a = 10001010b */ 0,
330 /* 0x8b = 10001011b */ X86_EFL_PF,
331 /* 0x8c = 10001100b */ 0,
332 /* 0x8d = 10001101b */ X86_EFL_PF,
333 /* 0x8e = 10001110b */ X86_EFL_PF,
334 /* 0x8f = 10001111b */ 0,
335 /* 0x90 = 10010000b */ X86_EFL_PF,
336 /* 0x91 = 10010001b */ 0,
337 /* 0x92 = 10010010b */ 0,
338 /* 0x93 = 10010011b */ X86_EFL_PF,
339 /* 0x94 = 10010100b */ 0,
340 /* 0x95 = 10010101b */ X86_EFL_PF,
341 /* 0x96 = 10010110b */ X86_EFL_PF,
342 /* 0x97 = 10010111b */ 0,
343 /* 0x98 = 10011000b */ 0,
344 /* 0x99 = 10011001b */ X86_EFL_PF,
345 /* 0x9a = 10011010b */ X86_EFL_PF,
346 /* 0x9b = 10011011b */ 0,
347 /* 0x9c = 10011100b */ X86_EFL_PF,
348 /* 0x9d = 10011101b */ 0,
349 /* 0x9e = 10011110b */ 0,
350 /* 0x9f = 10011111b */ X86_EFL_PF,
351 /* 0xa0 = 10100000b */ X86_EFL_PF,
352 /* 0xa1 = 10100001b */ 0,
353 /* 0xa2 = 10100010b */ 0,
354 /* 0xa3 = 10100011b */ X86_EFL_PF,
355 /* 0xa4 = 10100100b */ 0,
356 /* 0xa5 = 10100101b */ X86_EFL_PF,
357 /* 0xa6 = 10100110b */ X86_EFL_PF,
358 /* 0xa7 = 10100111b */ 0,
359 /* 0xa8 = 10101000b */ 0,
360 /* 0xa9 = 10101001b */ X86_EFL_PF,
361 /* 0xaa = 10101010b */ X86_EFL_PF,
362 /* 0xab = 10101011b */ 0,
363 /* 0xac = 10101100b */ X86_EFL_PF,
364 /* 0xad = 10101101b */ 0,
365 /* 0xae = 10101110b */ 0,
366 /* 0xaf = 10101111b */ X86_EFL_PF,
367 /* 0xb0 = 10110000b */ 0,
368 /* 0xb1 = 10110001b */ X86_EFL_PF,
369 /* 0xb2 = 10110010b */ X86_EFL_PF,
370 /* 0xb3 = 10110011b */ 0,
371 /* 0xb4 = 10110100b */ X86_EFL_PF,
372 /* 0xb5 = 10110101b */ 0,
373 /* 0xb6 = 10110110b */ 0,
374 /* 0xb7 = 10110111b */ X86_EFL_PF,
375 /* 0xb8 = 10111000b */ X86_EFL_PF,
376 /* 0xb9 = 10111001b */ 0,
377 /* 0xba = 10111010b */ 0,
378 /* 0xbb = 10111011b */ X86_EFL_PF,
379 /* 0xbc = 10111100b */ 0,
380 /* 0xbd = 10111101b */ X86_EFL_PF,
381 /* 0xbe = 10111110b */ X86_EFL_PF,
382 /* 0xbf = 10111111b */ 0,
383 /* 0xc0 = 11000000b */ X86_EFL_PF,
384 /* 0xc1 = 11000001b */ 0,
385 /* 0xc2 = 11000010b */ 0,
386 /* 0xc3 = 11000011b */ X86_EFL_PF,
387 /* 0xc4 = 11000100b */ 0,
388 /* 0xc5 = 11000101b */ X86_EFL_PF,
389 /* 0xc6 = 11000110b */ X86_EFL_PF,
390 /* 0xc7 = 11000111b */ 0,
391 /* 0xc8 = 11001000b */ 0,
392 /* 0xc9 = 11001001b */ X86_EFL_PF,
393 /* 0xca = 11001010b */ X86_EFL_PF,
394 /* 0xcb = 11001011b */ 0,
395 /* 0xcc = 11001100b */ X86_EFL_PF,
396 /* 0xcd = 11001101b */ 0,
397 /* 0xce = 11001110b */ 0,
398 /* 0xcf = 11001111b */ X86_EFL_PF,
399 /* 0xd0 = 11010000b */ 0,
400 /* 0xd1 = 11010001b */ X86_EFL_PF,
401 /* 0xd2 = 11010010b */ X86_EFL_PF,
402 /* 0xd3 = 11010011b */ 0,
403 /* 0xd4 = 11010100b */ X86_EFL_PF,
404 /* 0xd5 = 11010101b */ 0,
405 /* 0xd6 = 11010110b */ 0,
406 /* 0xd7 = 11010111b */ X86_EFL_PF,
407 /* 0xd8 = 11011000b */ X86_EFL_PF,
408 /* 0xd9 = 11011001b */ 0,
409 /* 0xda = 11011010b */ 0,
410 /* 0xdb = 11011011b */ X86_EFL_PF,
411 /* 0xdc = 11011100b */ 0,
412 /* 0xdd = 11011101b */ X86_EFL_PF,
413 /* 0xde = 11011110b */ X86_EFL_PF,
414 /* 0xdf = 11011111b */ 0,
415 /* 0xe0 = 11100000b */ 0,
416 /* 0xe1 = 11100001b */ X86_EFL_PF,
417 /* 0xe2 = 11100010b */ X86_EFL_PF,
418 /* 0xe3 = 11100011b */ 0,
419 /* 0xe4 = 11100100b */ X86_EFL_PF,
420 /* 0xe5 = 11100101b */ 0,
421 /* 0xe6 = 11100110b */ 0,
422 /* 0xe7 = 11100111b */ X86_EFL_PF,
423 /* 0xe8 = 11101000b */ X86_EFL_PF,
424 /* 0xe9 = 11101001b */ 0,
425 /* 0xea = 11101010b */ 0,
426 /* 0xeb = 11101011b */ X86_EFL_PF,
427 /* 0xec = 11101100b */ 0,
428 /* 0xed = 11101101b */ X86_EFL_PF,
429 /* 0xee = 11101110b */ X86_EFL_PF,
430 /* 0xef = 11101111b */ 0,
431 /* 0xf0 = 11110000b */ X86_EFL_PF,
432 /* 0xf1 = 11110001b */ 0,
433 /* 0xf2 = 11110010b */ 0,
434 /* 0xf3 = 11110011b */ X86_EFL_PF,
435 /* 0xf4 = 11110100b */ 0,
436 /* 0xf5 = 11110101b */ X86_EFL_PF,
437 /* 0xf6 = 11110110b */ X86_EFL_PF,
438 /* 0xf7 = 11110111b */ 0,
439 /* 0xf8 = 11111000b */ 0,
440 /* 0xf9 = 11111001b */ X86_EFL_PF,
441 /* 0xfa = 11111010b */ X86_EFL_PF,
442 /* 0xfb = 11111011b */ 0,
443 /* 0xfc = 11111100b */ X86_EFL_PF,
444 /* 0xfd = 11111101b */ 0,
445 /* 0xfe = 11111110b */ 0,
446 /* 0xff = 11111111b */ X86_EFL_PF,
447};
448
449/* for clang: */
450extern const RTFLOAT32U g_ar32Zero[];
451extern const RTFLOAT64U g_ar64Zero[];
452extern const RTFLOAT80U g_ar80Zero[];
453extern const RTFLOAT80U g_ar80One[];
454extern const RTFLOAT80U g_r80Indefinite;
455extern const RTFLOAT32U g_ar32Infinity[];
456extern const RTFLOAT64U g_ar64Infinity[];
457extern const RTFLOAT80U g_ar80Infinity[];
458extern const RTFLOAT128U g_r128Ln2;
459extern const RTUINT128U g_u128Ln2Mantissa;
460extern const RTUINT128U g_u128Ln2MantissaIntel;
461extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
462extern const RTFLOAT32U g_ar32QNaN[];
463extern const RTFLOAT64U g_ar64QNaN[];
464
465/** Zero values (indexed by fSign). */
466RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
467RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
468RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
469
470/** One values (indexed by fSign). */
471RTFLOAT80U const g_ar80One[] =
472{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
473
474/** Indefinite (negative). */
475RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
476
477/** Infinities (indexed by fSign). */
478RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
479RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
480RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
481
482/** Default QNaNs (indexed by fSign). */
483RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
484RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
485
486
#if 0 /* compiled out */
/** 128-bit floating point constant: 2.0 (zero fraction, exponent one above the bias). */
RTFLOAT128U const g_r128Two
    = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
#endif
491
492
493/* The next section is generated by tools/IEMGenFpuConstants: */
494
495/** The ln2 constant as 128-bit floating point value.
496 * base-10: 6.93147180559945309417232121458176575e-1
497 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
498 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
499//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
500const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
501/** High precision ln2 value.
502 * base-10: 6.931471805599453094172321214581765680747e-1
503 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
504 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
505const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
506/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
507 * base-10: 6.931471805599453094151379470289064954613e-1
508 * base-16: b.17217f7d1cf79abc0000000000000000@-1
509 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
510const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
511
512/** Horner constants for f2xm1 */
513const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
514{
515 /* a0
516 * base-10: 1.00000000000000000000000000000000000e0
517 * base-16: 1.0000000000000000000000000000@0
518 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
519 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
520 /* a1
521 * base-10: 5.00000000000000000000000000000000000e-1
522 * base-16: 8.0000000000000000000000000000@-1
523 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
524 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
525 /* a2
526 * base-10: 1.66666666666666666666666666666666658e-1
527 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
528 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
529 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
530 /* a3
531 * base-10: 4.16666666666666666666666666666666646e-2
532 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
533 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
534 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
535 /* a4
536 * base-10: 8.33333333333333333333333333333333323e-3
537 * base-16: 2.2222222222222222222222222222@-2
538 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
539 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
540 /* a5
541 * base-10: 1.38888888888888888888888888888888874e-3
542 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
543 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
544 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
545 /* a6
546 * base-10: 1.98412698412698412698412698412698412e-4
547 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
548 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
549 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
550 /* a7
551 * base-10: 2.48015873015873015873015873015873015e-5
552 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
553 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
554 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
555 /* a8
556 * base-10: 2.75573192239858906525573192239858902e-6
557 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
558 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
559 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
560 /* a9
561 * base-10: 2.75573192239858906525573192239858865e-7
562 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
563 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
564 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
565 /* a10
566 * base-10: 2.50521083854417187750521083854417184e-8
567 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
568 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
569 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
570 /* a11
571 * base-10: 2.08767569878680989792100903212014296e-9
572 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
573 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
574 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
575 /* a12
576 * base-10: 1.60590438368216145993923771701549472e-10
577 * base-16: b.092309d43684be51c198e91d7b40@-9
578 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
579 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
580 /* a13
581 * base-10: 1.14707455977297247138516979786821043e-11
582 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
583 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
584 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
585 /* a14
586 * base-10: 7.64716373181981647590113198578806964e-13
587 * base-16: d.73f9f399dc0f88ec32b587746578@-11
588 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
589 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
590 /* a15
591 * base-10: 4.77947733238738529743820749111754352e-14
592 * base-16: d.73f9f399dc0f88ec32b587746578@-12
593 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
594 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
595 /* a16
596 * base-10: 2.81145725434552076319894558301031970e-15
597 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
598 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
599 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
600 /* a17
601 * base-10: 1.56192069685862264622163643500573321e-16
602 * base-16: b.413c31dcbecbbdd8024435161550@-14
603 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
604 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
605 /* a18
606 * base-10: 8.22063524662432971695598123687227980e-18
607 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
608 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
609 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
610 /* a19
611 * base-10: 4.11031762331216485847799061843614006e-19
612 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
613 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
614 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
615 /* a20
616 * base-10: 7.04351638180413298434020229233492164e-20
617 * base-16: 1.4c9ee35db1d1f3c946fdcd48fd88@-16
618 * base-2 : 1.0100110010011110111000110101110110110001110100011111001111001001010001101111110111001101010010001111110110001000e-64 */
619 RTFLOAT128U_INIT_C(0, 0x4c9ee35db1d1, 0xf3c946fdcd48fd88, 0x3fbf),
620 /* a21
621 * base-10: 5.81527769640186708776361513365257702e-20
622 * base-16: 1.129e64bff606a2b9c9fc624481cd@-16
623 * base-2 : 1.0001001010011110011001001011111111110110000001101010001010111001110010011111110001100010010001001000000111001101e-64 */
624 RTFLOAT128U_INIT_C(0, 0x129e64bff606, 0xa2b9c9fc624481cd, 0x3fbf),
625};
626
627
628/*
629 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
630 * it all in C is probably safer atm., optimize what's necessary later, maybe.
631 */
632#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
633
634
635/*********************************************************************************************************************************
636* Binary Operations *
637*********************************************************************************************************************************/
638
639/*
640 * ADD
641 */
642
643IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
644{
645 uint64_t uDst = *puDst;
646 uint64_t uResult = uDst + uSrc;
647 *puDst = uResult;
648 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
649}
650
651# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
652
653IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
654{
655 uint32_t uDst = *puDst;
656 uint32_t uResult = uDst + uSrc;
657 *puDst = uResult;
658 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
659}
660
661
662IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
663{
664 uint16_t uDst = *puDst;
665 uint16_t uResult = uDst + uSrc;
666 *puDst = uResult;
667 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
668}
669
670
671IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
672{
673 uint8_t uDst = *puDst;
674 uint8_t uResult = uDst + uSrc;
675 *puDst = uResult;
676 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
677}
678
679# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
680
681/*
682 * ADC
683 */
684
685IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
686{
687 if (!(*pfEFlags & X86_EFL_CF))
688 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
689 else
690 {
691 uint64_t uDst = *puDst;
692 uint64_t uResult = uDst + uSrc + 1;
693 *puDst = uResult;
694 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
695 }
696}
697
698# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
699
700IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
701{
702 if (!(*pfEFlags & X86_EFL_CF))
703 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
704 else
705 {
706 uint32_t uDst = *puDst;
707 uint32_t uResult = uDst + uSrc + 1;
708 *puDst = uResult;
709 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
710 }
711}
712
713
714IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
715{
716 if (!(*pfEFlags & X86_EFL_CF))
717 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
718 else
719 {
720 uint16_t uDst = *puDst;
721 uint16_t uResult = uDst + uSrc + 1;
722 *puDst = uResult;
723 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
724 }
725}
726
727
728IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
729{
730 if (!(*pfEFlags & X86_EFL_CF))
731 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
732 else
733 {
734 uint8_t uDst = *puDst;
735 uint8_t uResult = uDst + uSrc + 1;
736 *puDst = uResult;
737 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
738 }
739}
740
741# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
742
743/*
744 * SUB
745 */
746
747IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
748{
749 uint64_t uDst = *puDst;
750 uint64_t uResult = uDst - uSrc;
751 *puDst = uResult;
752 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
753}
754
755# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
756
757IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
758{
759 uint32_t uDst = *puDst;
760 uint32_t uResult = uDst - uSrc;
761 *puDst = uResult;
762 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
763}
764
765
766IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
767{
768 uint16_t uDst = *puDst;
769 uint16_t uResult = uDst - uSrc;
770 *puDst = uResult;
771 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
772}
773
774
775IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
776{
777 uint8_t uDst = *puDst;
778 uint8_t uResult = uDst - uSrc;
779 *puDst = uResult;
780 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
781}
782
783# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
784
785/*
786 * SBB
787 */
788
789IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
790{
791 if (!(*pfEFlags & X86_EFL_CF))
792 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
793 else
794 {
795 uint64_t uDst = *puDst;
796 uint64_t uResult = uDst - uSrc - 1;
797 *puDst = uResult;
798 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
799 }
800}
801
802# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
803
804IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
805{
806 if (!(*pfEFlags & X86_EFL_CF))
807 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
808 else
809 {
810 uint32_t uDst = *puDst;
811 uint32_t uResult = uDst - uSrc - 1;
812 *puDst = uResult;
813 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
814 }
815}
816
817
818IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
819{
820 if (!(*pfEFlags & X86_EFL_CF))
821 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
822 else
823 {
824 uint16_t uDst = *puDst;
825 uint16_t uResult = uDst - uSrc - 1;
826 *puDst = uResult;
827 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
828 }
829}
830
831
832IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
833{
834 if (!(*pfEFlags & X86_EFL_CF))
835 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
836 else
837 {
838 uint8_t uDst = *puDst;
839 uint8_t uResult = uDst - uSrc - 1;
840 *puDst = uResult;
841 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
842 }
843}
844
845# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
846
847
848/*
849 * OR
850 */
851
852IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
853{
854 uint64_t uResult = *puDst | uSrc;
855 *puDst = uResult;
856 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
857}
858
859# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
860
861IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
862{
863 uint32_t uResult = *puDst | uSrc;
864 *puDst = uResult;
865 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
866}
867
868
869IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
870{
871 uint16_t uResult = *puDst | uSrc;
872 *puDst = uResult;
873 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
874}
875
876
877IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
878{
879 uint8_t uResult = *puDst | uSrc;
880 *puDst = uResult;
881 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
882}
883
884# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
885
886/*
887 * XOR
888 */
889
890IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
891{
892 uint64_t uResult = *puDst ^ uSrc;
893 *puDst = uResult;
894 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
895}
896
897# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
898
899IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
900{
901 uint32_t uResult = *puDst ^ uSrc;
902 *puDst = uResult;
903 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
904}
905
906
907IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
908{
909 uint16_t uResult = *puDst ^ uSrc;
910 *puDst = uResult;
911 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
912}
913
914
915IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
916{
917 uint8_t uResult = *puDst ^ uSrc;
918 *puDst = uResult;
919 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
920}
921
922# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
923
924/*
925 * AND
926 */
927
928IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
929{
930 uint64_t const uResult = *puDst & uSrc;
931 *puDst = uResult;
932 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
933}
934
935# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
936
937IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
938{
939 uint32_t const uResult = *puDst & uSrc;
940 *puDst = uResult;
941 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
942}
943
944
945IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
946{
947 uint16_t const uResult = *puDst & uSrc;
948 *puDst = uResult;
949 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
950}
951
952
953IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
954{
955 uint8_t const uResult = *puDst & uSrc;
956 *puDst = uResult;
957 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
958}
959
960# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
961#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
962
963/*
964 * ANDN (BMI1 instruction)
965 */
966
967IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
968{
969 uint64_t const uResult = ~uSrc1 & uSrc2;
970 *puDst = uResult;
971 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
972}
973
974
975IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
976{
977 uint32_t const uResult = ~uSrc1 & uSrc2;
978 *puDst = uResult;
979 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
980}
981
982
983#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
984IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
985{
986 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
987}
988#endif
989
990
991#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
992IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
993{
994 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
995}
996#endif
997
998#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
999
1000/*
1001 * CMP
1002 */
1003
1004IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1005{
1006 uint64_t uDstTmp = *puDst;
1007 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1008}
1009
1010# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1011
1012IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1013{
1014 uint32_t uDstTmp = *puDst;
1015 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1016}
1017
1018
1019IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1020{
1021 uint16_t uDstTmp = *puDst;
1022 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1023}
1024
1025
1026IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1027{
1028 uint8_t uDstTmp = *puDst;
1029 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1030}
1031
1032# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1033
1034/*
1035 * TEST
1036 */
1037
1038IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1039{
1040 uint64_t uResult = *puDst & uSrc;
1041 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1042}
1043
1044# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1045
1046IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1047{
1048 uint32_t uResult = *puDst & uSrc;
1049 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1050}
1051
1052
1053IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1054{
1055 uint16_t uResult = *puDst & uSrc;
1056 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1057}
1058
1059
1060IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1061{
1062 uint8_t uResult = *puDst & uSrc;
1063 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1064}
1065
1066# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1067
1068
1069/*
1070 * LOCK prefixed variants of the above
1071 */
1072
/** Locked binary operand operation, generic over the operand width.
 *
 * Runs the plain (non-locked) worker on a private copy of the destination and
 * commits the result with a compare-exchange, retrying until the exchange
 * succeeds.  The flags from the successful attempt are then stored.
 *
 * Expects puDst, uSrc and pfEFlags to be in scope at the expansion site.
 * NOTE(review): the retry relies on ASMAtomicCmpXchgEx* handing back the
 * current destination value via its last argument on failure - confirm. */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uExpected = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint32_t                   fEflNew; \
        for (;;) \
        { \
            uint ## a_cBitsWidth ## _t uNew = uExpected; \
            fEflNew = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNew, uSrc, &fEflNew); \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uExpected, &uExpected)) \
                break; \
        } \
        *pfEFlags = fEflNew; \
    } while (0)
1087
1088
/** Emits iemAImpl_<mnemonic>_u<width>_locked, whose whole body is
 *  DO_LOCKED_BIN_OP for the given mnemonic and operand width. */
#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked, \
                      (uint ## a_cBitsWidth ## _t *puDst, uint ## a_cBitsWidth ## _t uSrc, uint32_t *pfEFlags)) \
    { \
        DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
    }
1096
1097EMIT_LOCKED_BIN_OP(add, 64)
1098EMIT_LOCKED_BIN_OP(adc, 64)
1099EMIT_LOCKED_BIN_OP(sub, 64)
1100EMIT_LOCKED_BIN_OP(sbb, 64)
1101EMIT_LOCKED_BIN_OP(or, 64)
1102EMIT_LOCKED_BIN_OP(xor, 64)
1103EMIT_LOCKED_BIN_OP(and, 64)
1104# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1105EMIT_LOCKED_BIN_OP(add, 32)
1106EMIT_LOCKED_BIN_OP(adc, 32)
1107EMIT_LOCKED_BIN_OP(sub, 32)
1108EMIT_LOCKED_BIN_OP(sbb, 32)
1109EMIT_LOCKED_BIN_OP(or, 32)
1110EMIT_LOCKED_BIN_OP(xor, 32)
1111EMIT_LOCKED_BIN_OP(and, 32)
1112
1113EMIT_LOCKED_BIN_OP(add, 16)
1114EMIT_LOCKED_BIN_OP(adc, 16)
1115EMIT_LOCKED_BIN_OP(sub, 16)
1116EMIT_LOCKED_BIN_OP(sbb, 16)
1117EMIT_LOCKED_BIN_OP(or, 16)
1118EMIT_LOCKED_BIN_OP(xor, 16)
1119EMIT_LOCKED_BIN_OP(and, 16)
1120
1121EMIT_LOCKED_BIN_OP(add, 8)
1122EMIT_LOCKED_BIN_OP(adc, 8)
1123EMIT_LOCKED_BIN_OP(sub, 8)
1124EMIT_LOCKED_BIN_OP(sbb, 8)
1125EMIT_LOCKED_BIN_OP(or, 8)
1126EMIT_LOCKED_BIN_OP(xor, 8)
1127EMIT_LOCKED_BIN_OP(and, 8)
1128# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1129
1130
1131/*
1132 * Bit operations (same signature as above).
1133 */
1134
1135/*
1136 * BT
1137 */
1138
1139IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1140{
1141 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1142 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1143 Assert(uSrc < 64);
1144 uint64_t uDst = *puDst;
1145 if (uDst & RT_BIT_64(uSrc))
1146 *pfEFlags |= X86_EFL_CF;
1147 else
1148 *pfEFlags &= ~X86_EFL_CF;
1149}
1150
1151# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1152
1153IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1154{
1155 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1156 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1157 Assert(uSrc < 32);
1158 uint32_t uDst = *puDst;
1159 if (uDst & RT_BIT_32(uSrc))
1160 *pfEFlags |= X86_EFL_CF;
1161 else
1162 *pfEFlags &= ~X86_EFL_CF;
1163}
1164
1165IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1166{
1167 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1168 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1169 Assert(uSrc < 16);
1170 uint16_t uDst = *puDst;
1171 if (uDst & RT_BIT_32(uSrc))
1172 *pfEFlags |= X86_EFL_CF;
1173 else
1174 *pfEFlags &= ~X86_EFL_CF;
1175}
1176
1177# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1178
1179/*
1180 * BTC
1181 */
1182
1183IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1184{
1185 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1186 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1187 Assert(uSrc < 64);
1188 uint64_t fMask = RT_BIT_64(uSrc);
1189 uint64_t uDst = *puDst;
1190 if (uDst & fMask)
1191 {
1192 uDst &= ~fMask;
1193 *puDst = uDst;
1194 *pfEFlags |= X86_EFL_CF;
1195 }
1196 else
1197 {
1198 uDst |= fMask;
1199 *puDst = uDst;
1200 *pfEFlags &= ~X86_EFL_CF;
1201 }
1202}
1203
1204# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1205
1206IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1207{
1208 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1209 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1210 Assert(uSrc < 32);
1211 uint32_t fMask = RT_BIT_32(uSrc);
1212 uint32_t uDst = *puDst;
1213 if (uDst & fMask)
1214 {
1215 uDst &= ~fMask;
1216 *puDst = uDst;
1217 *pfEFlags |= X86_EFL_CF;
1218 }
1219 else
1220 {
1221 uDst |= fMask;
1222 *puDst = uDst;
1223 *pfEFlags &= ~X86_EFL_CF;
1224 }
1225}
1226
1227
1228IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1229{
1230 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1231 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1232 Assert(uSrc < 16);
1233 uint16_t fMask = RT_BIT_32(uSrc);
1234 uint16_t uDst = *puDst;
1235 if (uDst & fMask)
1236 {
1237 uDst &= ~fMask;
1238 *puDst = uDst;
1239 *pfEFlags |= X86_EFL_CF;
1240 }
1241 else
1242 {
1243 uDst |= fMask;
1244 *puDst = uDst;
1245 *pfEFlags &= ~X86_EFL_CF;
1246 }
1247}
1248
1249# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1250
1251/*
1252 * BTR
1253 */
1254
1255IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1256{
1257 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1258 logical operation (AND/OR/whatever). */
1259 Assert(uSrc < 64);
1260 uint64_t fMask = RT_BIT_64(uSrc);
1261 uint64_t uDst = *puDst;
1262 if (uDst & fMask)
1263 {
1264 uDst &= ~fMask;
1265 *puDst = uDst;
1266 *pfEFlags |= X86_EFL_CF;
1267 }
1268 else
1269 *pfEFlags &= ~X86_EFL_CF;
1270}
1271
1272# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1273
1274IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1275{
1276 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1277 logical operation (AND/OR/whatever). */
1278 Assert(uSrc < 32);
1279 uint32_t fMask = RT_BIT_32(uSrc);
1280 uint32_t uDst = *puDst;
1281 if (uDst & fMask)
1282 {
1283 uDst &= ~fMask;
1284 *puDst = uDst;
1285 *pfEFlags |= X86_EFL_CF;
1286 }
1287 else
1288 *pfEFlags &= ~X86_EFL_CF;
1289}
1290
1291
1292IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1293{
1294 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1295 logical operation (AND/OR/whatever). */
1296 Assert(uSrc < 16);
1297 uint16_t fMask = RT_BIT_32(uSrc);
1298 uint16_t uDst = *puDst;
1299 if (uDst & fMask)
1300 {
1301 uDst &= ~fMask;
1302 *puDst = uDst;
1303 *pfEFlags |= X86_EFL_CF;
1304 }
1305 else
1306 *pfEFlags &= ~X86_EFL_CF;
1307}
1308
1309# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1310
1311/*
1312 * BTS
1313 */
1314
1315IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1316{
1317 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1318 logical operation (AND/OR/whatever). */
1319 Assert(uSrc < 64);
1320 uint64_t fMask = RT_BIT_64(uSrc);
1321 uint64_t uDst = *puDst;
1322 if (uDst & fMask)
1323 *pfEFlags |= X86_EFL_CF;
1324 else
1325 {
1326 uDst |= fMask;
1327 *puDst = uDst;
1328 *pfEFlags &= ~X86_EFL_CF;
1329 }
1330}
1331
1332# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1333
1334IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1335{
1336 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1337 logical operation (AND/OR/whatever). */
1338 Assert(uSrc < 32);
1339 uint32_t fMask = RT_BIT_32(uSrc);
1340 uint32_t uDst = *puDst;
1341 if (uDst & fMask)
1342 *pfEFlags |= X86_EFL_CF;
1343 else
1344 {
1345 uDst |= fMask;
1346 *puDst = uDst;
1347 *pfEFlags &= ~X86_EFL_CF;
1348 }
1349}
1350
1351
1352IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1353{
1354 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1355 logical operation (AND/OR/whatever). */
1356 Assert(uSrc < 16);
1357 uint16_t fMask = RT_BIT_32(uSrc);
1358 uint32_t uDst = *puDst;
1359 if (uDst & fMask)
1360 *pfEFlags |= X86_EFL_CF;
1361 else
1362 {
1363 uDst |= fMask;
1364 *puDst = uDst;
1365 *pfEFlags &= ~X86_EFL_CF;
1366 }
1367}
1368
1369# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1370
1371
1372EMIT_LOCKED_BIN_OP(btc, 64)
1373EMIT_LOCKED_BIN_OP(btr, 64)
1374EMIT_LOCKED_BIN_OP(bts, 64)
1375# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1376EMIT_LOCKED_BIN_OP(btc, 32)
1377EMIT_LOCKED_BIN_OP(btr, 32)
1378EMIT_LOCKED_BIN_OP(bts, 32)
1379
1380EMIT_LOCKED_BIN_OP(btc, 16)
1381EMIT_LOCKED_BIN_OP(btr, 16)
1382EMIT_LOCKED_BIN_OP(bts, 16)
1383# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1384
1385
1386/*
1387 * Helpers for BSR and BSF.
1388 *
1389 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1390 * Intel behavior modelled on 10980xe, AMD on 3990X. Other marchs may
1391 * produce different result (see https://www.sandpile.org/x86/flags.htm),
1392 * but we restrict ourselves to emulating these recent marchs.
1393 */
/** Stores a BSF/BSR result the way the Intel flavour does.
 *
 * @param   a_puDst     Where to store the zero-based bit index; only written
 *                      when a bit was found.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_iBit      One-based index of the bit found, 0 if no bit was set.
 *
 * OF, SF, ZF, AF, PF and CF are all cleared first.  When a bit was found the
 * g_afParity entry for the zero-based index is ORed in; otherwise ZF and PF
 * are set.
 *
 * Note: the flags pointer used to be taken from a variable named pfEFlags at
 * the expansion site because the parameter was misspelled; it is now a real
 * macro parameter. */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/** Stores a BSF/BSR result the way the AMD flavour does.
 *
 * @param   a_puDst     Where to store the zero-based bit index; only written
 *                      when a bit was found.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_iBit      One-based index of the bit found, 0 if no bit was set.
 *
 * Only ZF is modified: cleared when a bit was found, set otherwise.
 *
 * Note: the flags pointer used to be taken from a variable named pfEFlags at
 * the expansion site because the parameter was misspelled; it is now a real
 * macro parameter. */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1416
1417
1418/*
1419 * BSF - first (least significant) bit set
1420 */
1421IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1422{
1423 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1424}
1425
1426IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1427{
1428 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1429}
1430
1431IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1432{
1433 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1434}
1435
1436# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1437
1438IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1439{
1440 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1441}
1442
1443IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1444{
1445 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1446}
1447
1448IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1449{
1450 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1451}
1452
1453
1454IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1455{
1456 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1457}
1458
1459IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1460{
1461 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1462}
1463
1464IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1465{
1466 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1467}
1468
1469# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1470
1471
1472/*
1473 * BSR - last (most significant) bit set
1474 */
1475IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1476{
1477 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1478}
1479
1480IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1481{
1482 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1483}
1484
1485IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1486{
1487 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1488}
1489
1490# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1491
1492IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1493{
1494 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1495}
1496
1497IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1498{
1499 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1500}
1501
1502IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1503{
1504 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1505}
1506
1507
1508IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1509{
1510 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1511}
1512
1513IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1514{
1515 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1516}
1517
1518IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1519{
1520 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1521}
1522
1523# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1524
1525
1526/*
1527 * Helpers for LZCNT and TZCNT.
1528 */
/** Stores an LZCNT/TZCNT result the way the Intel flavour does.
 *
 * @param   a_puDst     Where to store the count.
 * @param   a_uSrc      The source operand; CF is set when it is zero.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_uResult   The zero-bit count to store.
 *
 * OF, SF, ZF, AF, PF and CF are all cleared first.  A non-zero count gets its
 * g_afParity entry ORed in, a zero count sets ZF and PF.
 *
 * a_uSrc is parenthesized so an expression argument is negated as a whole. */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/** Stores an LZCNT/TZCNT result the way the AMD flavour does.
 *
 * @param   a_puDst     Where to store the count.
 * @param   a_uSrc      The source operand; CF is set when it is zero.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_uResult   The zero-bit count to store.
 *
 * Only ZF and CF are modified: ZF is set for a zero count, CF for a zero
 * source; both are cleared otherwise.
 *
 * a_uSrc is parenthesized so an expression argument is negated as a whole. */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1551
1552
1553/*
1554 * LZCNT - count leading zero bits.
1555 */
1556IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1557{
1558 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1559}
1560
1561IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1562{
1563 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1564}
1565
1566IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1567{
1568 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1569}
1570
1571# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1572
1573IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1574{
1575 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1576}
1577
1578IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1579{
1580 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1581}
1582
1583IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1584{
1585 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1586}
1587
1588
1589IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1590{
1591 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1592}
1593
1594IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1595{
1596 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1597}
1598
1599IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1600{
1601 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1602}
1603
1604# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1605
1606
/*
 * TZCNT - count trailing zero bits.
 */
1610IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1611{
1612 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1613}
1614
1615IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1616{
1617 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1618}
1619
1620IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1621{
1622 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1623}
1624
1625# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1626
1627IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1628{
1629 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1630}
1631
1632IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1633{
1634 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1635}
1636
1637IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1638{
1639 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1640}
1641
1642
1643IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1644{
1645 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1646}
1647
1648IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1649{
1650 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1651}
1652
1653IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1654{
1655 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1656}
1657
1658# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1659#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1660
1661/*
1662 * BEXTR (BMI1 instruction)
1663 */
1664#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1665IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1666 a_Type uSrc2, uint32_t *pfEFlags)) \
1667{ \
1668 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1669 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1670 a_Type uResult; \
1671 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1672 if (iFirstBit < a_cBits) \
1673 { \
1674 uResult = uSrc1 >> iFirstBit; \
1675 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1676 if (cBits < a_cBits) \
1677 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1678 *puDst = uResult; \
1679 if (!uResult) \
1680 fEfl |= X86_EFL_ZF; \
1681 } \
1682 else \
1683 { \
1684 *puDst = uResult = 0; \
1685 fEfl |= X86_EFL_ZF; \
1686 } \
1687 /** @todo complete flag calculations. */ \
1688 *pfEFlags = fEfl; \
1689}
1690
1691EMIT_BEXTR(64, uint64_t, _fallback)
1692EMIT_BEXTR(32, uint32_t, _fallback)
1693#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1694EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1695#endif
1696#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1697EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1698#endif
1699
1700/*
1701 * BLSR (BMI1 instruction)
1702 */
1703#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1704IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1705{ \
1706 uint32_t fEfl1 = *pfEFlags; \
1707 uint32_t fEfl2 = fEfl1; \
1708 *puDst = uSrc; \
1709 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1710 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1711 \
1712 /* AMD: The carry flag is from the SUB operation. */ \
1713 /* 10890xe: PF always cleared? */ \
1714 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1715 fEfl2 |= fEfl1 & X86_EFL_CF; \
1716 *pfEFlags = fEfl2; \
1717}
1718
1719EMIT_BLSR(64, uint64_t, _fallback)
1720EMIT_BLSR(32, uint32_t, _fallback)
1721#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1722EMIT_BLSR(64, uint64_t, RT_NOTHING)
1723#endif
1724#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1725EMIT_BLSR(32, uint32_t, RT_NOTHING)
1726#endif
1727
1728/*
1729 * BLSMSK (BMI1 instruction)
1730 */
1731#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1732IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1733{ \
1734 uint32_t fEfl1 = *pfEFlags; \
1735 uint32_t fEfl2 = fEfl1; \
1736 *puDst = uSrc; \
1737 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1738 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1739 \
1740 /* AMD: The carry flag is from the SUB operation. */ \
1741 /* 10890xe: PF always cleared? */ \
1742 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1743 fEfl2 |= fEfl1 & X86_EFL_CF; \
1744 *pfEFlags = fEfl2; \
1745}
1746
1747EMIT_BLSMSK(64, uint64_t, _fallback)
1748EMIT_BLSMSK(32, uint32_t, _fallback)
1749#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1750EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1751#endif
1752#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1753EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1754#endif
1755
1756/*
1757 * BLSI (BMI1 instruction)
1758 */
1759#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1760IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1761{ \
1762 uint32_t fEfl1 = *pfEFlags; \
1763 uint32_t fEfl2 = fEfl1; \
1764 *puDst = uSrc; \
1765 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1766 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1767 \
1768 /* AMD: The carry flag is from the SUB operation. */ \
1769 /* 10890xe: PF always cleared? */ \
1770 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1771 fEfl2 |= fEfl1 & X86_EFL_CF; \
1772 *pfEFlags = fEfl2; \
1773}
1774
1775EMIT_BLSI(64, uint64_t, _fallback)
1776EMIT_BLSI(32, uint32_t, _fallback)
1777#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1778EMIT_BLSI(64, uint64_t, RT_NOTHING)
1779#endif
1780#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1781EMIT_BLSI(32, uint32_t, RT_NOTHING)
1782#endif
1783
1784/*
1785 * BZHI (BMI2 instruction)
1786 */
1787#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1788IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1789 a_Type uSrc2, uint32_t *pfEFlags)) \
1790{ \
1791 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1792 a_Type uResult; \
1793 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1794 if (iFirstBit < a_cBits) \
1795 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1796 else \
1797 { \
1798 uResult = uSrc1; \
1799 fEfl |= X86_EFL_CF; \
1800 } \
1801 *puDst = uResult; \
1802 fEfl |= X86_EFL_CALC_ZF(uResult); \
1803 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1804 *pfEFlags = fEfl; \
1805}
1806
1807EMIT_BZHI(64, uint64_t, _fallback)
1808EMIT_BZHI(32, uint32_t, _fallback)
1809#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1810EMIT_BZHI(64, uint64_t, RT_NOTHING)
1811#endif
1812#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1813EMIT_BZHI(32, uint32_t, RT_NOTHING)
1814#endif
1815
1816/*
1817 * POPCNT
1818 */
1819RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1820{
1821 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1822 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1823 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1824 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1825};
1826
1827/** @todo Use native popcount where possible and employ some more efficient
1828 * algorithm here (or in asm.h fallback)! */
1829
1830DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1831{
1832 return g_abBitCounts6[ u16 & 0x3f]
1833 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1834 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1835}
1836
1837DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1838{
1839 return g_abBitCounts6[ u32 & 0x3f]
1840 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1841 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1842 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1843 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1844 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1845}
1846
1847DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1848{
1849 return g_abBitCounts6[ u64 & 0x3f]
1850 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1851 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1852 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1853 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1854 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1855 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1856 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1857 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1858 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1859 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1860}
1861
1862#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1863IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1864{ \
1865 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1866 a_Type uResult; \
1867 if (uSrc) \
1868 uResult = iemPopCountU ## a_cBits(uSrc); \
1869 else \
1870 { \
1871 fEfl |= X86_EFL_ZF; \
1872 uResult = 0; \
1873 } \
1874 *puDst = uResult; \
1875 *pfEFlags = fEfl; \
1876}
1877
1878EMIT_POPCNT(64, uint64_t, _fallback)
1879EMIT_POPCNT(32, uint32_t, _fallback)
1880EMIT_POPCNT(16, uint16_t, _fallback)
1881#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1882EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1883#endif
1884#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1885EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1886EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1887#endif
1888
1889
1890#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1891
1892/*
1893 * XCHG
1894 */
1895
1896IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1897{
1898#if ARCH_BITS >= 64
1899 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1900#else
1901 uint64_t uOldMem = *puMem;
1902 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1903 ASMNopPause();
1904 *puReg = uOldMem;
1905#endif
1906}
1907
1908# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1909
1910IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1911{
1912 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1913}
1914
1915
1916IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1917{
1918 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1919}
1920
1921
1922IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1923{
1924 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1925}
1926
1927# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1928
1929
1930/* Unlocked variants for fDisregardLock mode: */
1931
1932IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1933{
1934 uint64_t const uOld = *puMem;
1935 *puMem = *puReg;
1936 *puReg = uOld;
1937}
1938
1939# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1940
1941IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1942{
1943 uint32_t const uOld = *puMem;
1944 *puMem = *puReg;
1945 *puReg = uOld;
1946}
1947
1948
1949IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1950{
1951 uint16_t const uOld = *puMem;
1952 *puMem = *puReg;
1953 *puReg = uOld;
1954}
1955
1956
1957IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1958{
1959 uint8_t const uOld = *puMem;
1960 *puMem = *puReg;
1961 *puReg = uOld;
1962}
1963
1964# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1965
1966
1967/*
1968 * XADD and LOCK XADD.
1969 */
1970#define EMIT_XADD(a_cBitsWidth, a_Type) \
1971IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1972{ \
1973 a_Type uDst = *puDst; \
1974 a_Type uResult = uDst; \
1975 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1976 *puDst = uResult; \
1977 *puReg = uDst; \
1978} \
1979\
1980IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1981{ \
1982 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1983 a_Type uResult; \
1984 uint32_t fEflTmp; \
1985 do \
1986 { \
1987 uResult = uOld; \
1988 fEflTmp = *pfEFlags; \
1989 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
1990 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
1991 *puReg = uOld; \
1992 *pfEFlags = fEflTmp; \
1993}
1994EMIT_XADD(64, uint64_t)
1995# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1996EMIT_XADD(32, uint32_t)
1997EMIT_XADD(16, uint16_t)
1998EMIT_XADD(8, uint8_t)
1999# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2000
2001#endif
2002
2003/*
2004 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2005 *
2006 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2007 * instructions are emulated as locked.
2008 */
2009#if defined(IEM_WITHOUT_ASSEMBLY)
2010
2011IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2012{
2013 uint8_t uOld = *puAl;
2014 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2015 Assert(*puAl == uOld);
2016 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2017}
2018
2019
2020IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2021{
2022 uint16_t uOld = *puAx;
2023 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2024 Assert(*puAx == uOld);
2025 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2026}
2027
2028
2029IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2030{
2031 uint32_t uOld = *puEax;
2032 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2033 Assert(*puEax == uOld);
2034 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2035}
2036
2037
2038# if ARCH_BITS == 32
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2040# else
2041IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2042# endif
2043{
2044# if ARCH_BITS == 32
2045 uint64_t const uSrcReg = *puSrcReg;
2046# endif
2047 uint64_t uOld = *puRax;
2048 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2049 Assert(*puRax == uOld);
2050 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2051}
2052
2053
2054IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2055 uint32_t *pEFlags))
2056{
2057 uint64_t const uNew = pu64EbxEcx->u;
2058 uint64_t const uOld = pu64EaxEdx->u;
2059 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2060 {
2061 Assert(pu64EaxEdx->u == uOld);
2062 *pEFlags |= X86_EFL_ZF;
2063 }
2064 else
2065 *pEFlags &= ~X86_EFL_ZF;
2066}
2067
2068
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * Locked CMPXCHG16B.
 *
 * @param   pu128Dst    The 128-bit destination operand.
 * @param   pu128RaxRdx The RDX:RAX pair: supplies the value to compare with and
 *                      is passed to the compare-exchange as the location for
 *                      the old destination value.
 * @param   pu128RbxRcx The RCX:RBX pair: the value to store on a match.
 * @param   pEFlags     EFLAGS; ZF is set when the exchange took place and
 *                      cleared otherwise.  No other flag is touched.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    RTUINT128U const uOld = *pu128RaxRdx; /* Only needed by the assertion below. */
#  endif
#  if defined(RT_ARCH_AMD64)
    bool const fSwapped = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo,
                                                 pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo, &pu128RaxRdx->u);
#  else
    bool const fSwapped = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  endif
    if (!fSwapped)
        *pEFlags &= ~X86_EFL_ZF;
    else
    {
        Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
}
# endif
2090
2091#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2092
2093# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2094IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2095 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2096{
2097 RTUINT128U u128Tmp = *pu128Dst;
2098 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2099 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2100 {
2101 *pu128Dst = *pu128RbxRcx;
2102 *pEFlags |= X86_EFL_ZF;
2103 }
2104 else
2105 {
2106 *pu128RaxRdx = u128Tmp;
2107 *pEFlags &= ~X86_EFL_ZF;
2108 }
2109}
2110#endif /* !RT_ARCH_ARM64 */
2111
#if defined(IEM_WITHOUT_ASSEMBLY)

/*
 * Unlocked versions mapped to the locked ones.  Each function below simply
 * forwards its arguments unchanged to the corresponding _locked worker.
 */

/** 8-bit CMPXCHG without LOCK prefix; forwards to iemAImpl_cmpxchg_u8_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
}


/** 16-bit CMPXCHG without LOCK prefix; forwards to iemAImpl_cmpxchg_u16_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
}


/** 32-bit CMPXCHG without LOCK prefix; forwards to iemAImpl_cmpxchg_u32_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
}


/** 64-bit CMPXCHG without LOCK prefix; forwards to iemAImpl_cmpxchg_u64_locked.
 * The source register is passed by reference on 32-bit hosts, by value otherwise. */
# if ARCH_BITS == 32
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
}
# else
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
}
# endif


/** CMPXCHG8B without LOCK prefix; forwards to iemAImpl_cmpxchg8b_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
}


/** CMPXCHG16B without LOCK prefix; forwards to iemAImpl_cmpxchg16b_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                             uint32_t *pEFlags))
{
    iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
}

#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2160
2161#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2162 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2163
2164/*
2165 * MUL, IMUL, DIV and IDIV helpers.
2166 *
2167 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2168 * division step so we can select between using C operators and
2169 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2170 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
 *   IDIV/DIV taking all of their input in AX too.  This means we have to
 *   abstract some input loads and the result storing.
2174 */
2175
2176DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2177{
2178# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2179 pQuotient->s.Lo = 0;
2180 pQuotient->s.Hi = 0;
2181# endif
2182 RTUINT128U Divisor;
2183 Divisor.s.Lo = u64Divisor;
2184 Divisor.s.Hi = 0;
2185 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2186}
2187
/*
 * Operand load/store and arithmetic abstractions for the MUL, IMUL, DIV and
 * IDIV emitter macros.
 *
 * The load/store macros reference the worker's parameters by name (puA and
 * puD, or puAX for the 8-bit variants).  All of them expand to a single
 * expression WITHOUT a terminating semicolon, so the user supplies the ';'
 * and the macros stay usable in unbraced if/else bodies.
 */

/** Loads the double width dividend from *puD (high half) and *puA (low half). */
# define DIV_LOAD(a_Dividend) \
    (a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD
/** Loads the 16-bit dividend of the 8-bit variants from *puAX. */
# define DIV_LOAD_U8(a_Dividend) \
    (a_Dividend).u = *puAX

/** Stores the quotient in *puA and the remainder in *puD. */
# define DIV_STORE(a_Quotient, a_uRemainder)    *puA  = (a_Quotient), *puD = (a_uRemainder)
/** Stores the quotient in the low byte and the remainder in the high byte of *puAX. */
# define DIV_STORE_U8(a_Quotient, a_uRemainder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8)

/** Loads the implicit first factor from *puA. */
# define MUL_LOAD_F1()                          *puA
/** Loads the implicit first factor of the 8-bit variants from the low byte of *puAX. */
# define MUL_LOAD_F1_U8()                       ((uint8_t)*puAX)

/** Stores the low half of the product in *puA and the high half in *puD. */
# define MUL_STORE(a_Result)                    *puA  = (a_Result).s.Lo, *puD = (a_Result).s.Hi
/** Stores the whole 16-bit product of the 8-bit variants in *puAX. */
# define MUL_STORE_U8(a_Result)                 *puAX = (a_Result).u

/** Two's complement negation of a double width value held in the 'u' member. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
/** 128-bit flavour of MULDIV_NEG (the width argument is ignored). */
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/** Unsigned multiplication producing a double width result in the 'u' member. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
/** 64-bit by 64-bit flavour of MULDIV_MUL (the width argument is ignored). */
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2)

/** Unsigned division yielding quotient and remainder; the divisor must not be zero. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    (a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
    (a_Remainder).u = (a_Dividend).u % (a_uDivisor)
/** 128-bit by 64-bit flavour of MULDIV_MODDIV. */
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), a_uDivisor)
2217
2218
2219/*
2220 * MUL
2221 */
2222# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2223IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2224{ \
2225 RTUINT ## a_cBitsWidth2x ## U Result; \
2226 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2227 a_fnStore(Result); \
2228 \
2229 /* Calc EFLAGS: */ \
2230 uint32_t fEfl = *pfEFlags; \
2231 if (a_fIntelFlags) \
2232 { /* Intel: 6700K and 10980XE behavior */ \
2233 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2234 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2235 fEfl |= X86_EFL_SF; \
2236 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2237 if (Result.s.Hi != 0) \
2238 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2239 } \
2240 else \
2241 { /* AMD: 3990X */ \
2242 if (Result.s.Hi != 0) \
2243 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2244 else \
2245 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2246 } \
2247 *pfEFlags = fEfl; \
2248 return 0; \
2249} \
2250
2251# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2252 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2253 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2254 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2255
2256# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2257EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2258 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2259# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2260EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2261 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2262EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2263 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2264EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2265 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2266# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2267# endif /* !DOXYGEN_RUNNING */
2268
2269/*
2270 * MULX
2271 */
2272# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2273IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2274 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2275{ \
2276 RTUINT ## a_cBitsWidth2x ## U Result; \
2277 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2278 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2279 *puDst1 = Result.s.Hi; \
2280} \
2281
2282# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2283EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2284EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2285# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2286EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2287EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2288# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2289# endif /* !DOXYGEN_RUNNING */
2290
2291
2292/*
2293 * IMUL
2294 *
2295 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2296 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2297 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2298 */
2299# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2300 a_Suffix, a_fIntelFlags) \
2301IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2302{ \
2303 RTUINT ## a_cBitsWidth2x ## U Result; \
2304 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2305 \
2306 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2307 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2308 { \
2309 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2310 { \
2311 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2312 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2313 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2314 } \
2315 else \
2316 { \
2317 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2318 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2319 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2320 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2321 a_fnNeg(Result, a_cBitsWidth2x); \
2322 } \
2323 } \
2324 else \
2325 { \
2326 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2327 { \
2328 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2329 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2330 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2331 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2332 a_fnNeg(Result, a_cBitsWidth2x); \
2333 } \
2334 else \
2335 { \
2336 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2337 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2338 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2339 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2340 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2341 } \
2342 } \
2343 a_fnStore(Result); \
2344 \
2345 if (a_fIntelFlags) \
2346 { \
2347 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2348 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2349 fEfl |= X86_EFL_SF; \
2350 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2351 } \
2352 *pfEFlags = fEfl; \
2353 return 0; \
2354}
2355# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2356 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2357 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2358 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2359
2360# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2361EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2362 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2363# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2364EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2365 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2366EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2367 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2368EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2369 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2370# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2371# endif /* !DOXYGEN_RUNNING */
2372
2373
2374/*
2375 * IMUL with two operands are mapped onto the three operand variant, ignoring
2376 * the high part of the product.
2377 */
2378# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2379IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2380{ \
2381 a_uType uIgn; \
2382 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2383} \
2384\
2385IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2386{ \
2387 a_uType uIgn; \
2388 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2389} \
2390\
2391IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2392{ \
2393 a_uType uIgn; \
2394 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2395}
2396
2397EMIT_IMUL_TWO(64, uint64_t)
2398# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2399EMIT_IMUL_TWO(32, uint32_t)
2400EMIT_IMUL_TWO(16, uint16_t)
2401# endif
2402
2403
2404/*
2405 * DIV
2406 */
2407# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2408 a_Suffix, a_fIntelFlags) \
2409IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2410{ \
2411 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2412 a_fnLoad(Dividend); \
2413 if ( uDivisor != 0 \
2414 && Dividend.s.Hi < uDivisor) \
2415 { \
2416 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2417 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2418 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2419 \
2420 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2421 if (!a_fIntelFlags) \
2422 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2423 return 0; \
2424 } \
2425 /* #DE */ \
2426 return -1; \
2427}
2428# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2429 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2430 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2431 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2432
2433# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2434EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2435 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2436# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2437EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2438 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2439EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2440 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2441EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2442 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2443# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2444# endif /* !DOXYGEN_RUNNING */
2445
2446
2447/*
2448 * IDIV
2449 *
2450 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2451 * set AF and clear PF, ZF and SF just like it does for DIV.
2452 *
2453 */
2454# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2455 a_Suffix, a_fIntelFlags) \
2456IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2457{ \
2458 /* Note! Skylake leaves all flags alone. */ \
2459 \
2460 /** @todo overflow checks */ \
2461 if (uDivisor != 0) \
2462 { \
2463 /* \
2464 * Convert to unsigned division. \
2465 */ \
2466 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2467 a_fnLoad(Dividend); \
2468 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2469 if (fSignedDividend) \
2470 a_fnNeg(Dividend, a_cBitsWidth2x); \
2471 \
2472 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2473 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2474 uDivisorPositive = uDivisor; \
2475 else \
2476 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2477 \
2478 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2479 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2480 \
2481 /* \
2482 * Setup the result, checking for overflows. \
2483 */ \
2484 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2485 { \
2486 if (!fSignedDividend) \
2487 { \
2488 /* Positive divisor, positive dividend => result positive. */ \
2489 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2490 { \
2491 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2492 if (!a_fIntelFlags) \
2493 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2494 return 0; \
2495 } \
2496 } \
2497 else \
2498 { \
2499 /* Positive divisor, negative dividend => result negative. */ \
2500 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2501 { \
2502 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2503 if (!a_fIntelFlags) \
2504 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2505 return 0; \
2506 } \
2507 } \
2508 } \
2509 else \
2510 { \
2511 if (!fSignedDividend) \
2512 { \
2513 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2514 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2515 { \
2516 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2517 if (!a_fIntelFlags) \
2518 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2519 return 0; \
2520 } \
2521 } \
2522 else \
2523 { \
2524 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2525 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2526 { \
2527 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2528 if (!a_fIntelFlags) \
2529 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2530 return 0; \
2531 } \
2532 } \
2533 } \
2534 } \
2535 /* #DE */ \
2536 return -1; \
2537}
2538# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2539 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2540 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2541 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2542
2543# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2544EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2545 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2546# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2547EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2548 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2549EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2550 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2551EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2552 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2553# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2554# endif /* !DOXYGEN_RUNNING */
2555
2556#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2557
2558
2559/*********************************************************************************************************************************
2560* Unary operations. *
2561*********************************************************************************************************************************/
2562#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2563
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
 *
 * CF is NOT modified, for historical reasons (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * This is a statement macro: it yields no value, the new flags are written
 * back through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be a
 *                          plain literal as it is pasted into a macro name.
 * @param   a_OfMethod      0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        /* Clear all status bits except CF (the '~' applies to X86_EFL_STATUS_BITS only). */ \
        fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* INC overflows when the sign bit goes from clear to set, DEC when it goes from set to clear. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_OfMethod) == 0 \
                                                   ? (((a_uDst) ^ RT_BIT_64((a_cBitsWidth) - 1)) & (a_uResult)) \
                                                   : ((a_uDst) & ((a_uResult) ^ RT_BIT_64((a_cBitsWidth) - 1))) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2589
2590/*
2591 * INC
2592 */
2593
2594IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2595{
2596 uint64_t uDst = *puDst;
2597 uint64_t uResult = uDst + 1;
2598 *puDst = uResult;
2599 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2600}
2601
2602# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2603
2604IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2605{
2606 uint32_t uDst = *puDst;
2607 uint32_t uResult = uDst + 1;
2608 *puDst = uResult;
2609 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2610}
2611
2612
2613IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2614{
2615 uint16_t uDst = *puDst;
2616 uint16_t uResult = uDst + 1;
2617 *puDst = uResult;
2618 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2619}
2620
2621IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2622{
2623 uint8_t uDst = *puDst;
2624 uint8_t uResult = uDst + 1;
2625 *puDst = uResult;
2626 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2627}
2628
2629# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2630
2631
2632/*
2633 * DEC
2634 */
2635
2636IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2637{
2638 uint64_t uDst = *puDst;
2639 uint64_t uResult = uDst - 1;
2640 *puDst = uResult;
2641 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2642}
2643
2644# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2645
2646IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2647{
2648 uint32_t uDst = *puDst;
2649 uint32_t uResult = uDst - 1;
2650 *puDst = uResult;
2651 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2652}
2653
2654
2655IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2656{
2657 uint16_t uDst = *puDst;
2658 uint16_t uResult = uDst - 1;
2659 *puDst = uResult;
2660 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2661}
2662
2663
2664IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2665{
2666 uint8_t uDst = *puDst;
2667 uint8_t uResult = uDst - 1;
2668 *puDst = uResult;
2669 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2670}
2671
2672# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2673
2674
2675/*
2676 * NOT
2677 */
2678
2679IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2680{
2681 uint64_t uDst = *puDst;
2682 uint64_t uResult = ~uDst;
2683 *puDst = uResult;
2684 /* EFLAGS are not modified. */
2685 RT_NOREF_PV(pfEFlags);
2686}
2687
2688# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2689
2690IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2691{
2692 uint32_t uDst = *puDst;
2693 uint32_t uResult = ~uDst;
2694 *puDst = uResult;
2695 /* EFLAGS are not modified. */
2696 RT_NOREF_PV(pfEFlags);
2697}
2698
2699IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2700{
2701 uint16_t uDst = *puDst;
2702 uint16_t uResult = ~uDst;
2703 *puDst = uResult;
2704 /* EFLAGS are not modified. */
2705 RT_NOREF_PV(pfEFlags);
2706}
2707
2708IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2709{
2710 uint8_t uDst = *puDst;
2711 uint8_t uResult = ~uDst;
2712 *puDst = uResult;
2713 /* EFLAGS are not modified. */
2714 RT_NOREF_PV(pfEFlags);
2715}
2716
2717# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2718
2719
2720/*
2721 * NEG
2722 */
2723
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for a NEG instruction.
 *
 * This is a statement macro and does not yield a value; the new flags are
 * stored back through @a a_pfEFlags.  Bits outside X86_EFL_STATUS_BITS are
 * carried over unchanged.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for CF, AF and OF
 *                          calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be a
 *                          plain literal since it is pasted onto
 *                          X86_EFL_GET_OF_.
 *
 * @note    @a a_uResult and @a a_uDst are expanded more than once, so do not
 *          pass expressions with side effects.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        /* X86_EFL_STATUS_BITS already covers CF, so one mask clears them all. */ \
        uint32_t fEflTmp = *(a_pfEFlags) & ~X86_EFL_STATUS_BITS; \
        /* CF is set unless the operand was zero. */ \
        fEflTmp |= (uint32_t)((a_uDst) != 0) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* OF: only when the top bit is set both before and after. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2745
2746IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2747{
2748 uint64_t uDst = *puDst;
2749 uint64_t uResult = (uint64_t)0 - uDst;
2750 *puDst = uResult;
2751 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2752}
2753
2754# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2755
2756IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2757{
2758 uint32_t uDst = *puDst;
2759 uint32_t uResult = (uint32_t)0 - uDst;
2760 *puDst = uResult;
2761 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2762}
2763
2764
2765IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2766{
2767 uint16_t uDst = *puDst;
2768 uint16_t uResult = (uint16_t)0 - uDst;
2769 *puDst = uResult;
2770 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2771}
2772
2773
2774IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2775{
2776 uint8_t uDst = *puDst;
2777 uint8_t uResult = (uint8_t)0 - uDst;
2778 *puDst = uResult;
2779 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2780}
2781
2782# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2783
2784/*
2785 * Locked variants.
2786 */
2787
2788/** Emit a function for doing a locked unary operand operation. */
2789# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2790 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2791 uint32_t *pfEFlags)) \
2792 { \
2793 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2794 uint ## a_cBitsWidth ## _t uTmp; \
2795 uint32_t fEflTmp; \
2796 do \
2797 { \
2798 uTmp = uOld; \
2799 fEflTmp = *pfEFlags; \
2800 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2801 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2802 *pfEFlags = fEflTmp; \
2803 }
2804
2805EMIT_LOCKED_UNARY_OP(inc, 64)
2806EMIT_LOCKED_UNARY_OP(dec, 64)
2807EMIT_LOCKED_UNARY_OP(not, 64)
2808EMIT_LOCKED_UNARY_OP(neg, 64)
2809# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2810EMIT_LOCKED_UNARY_OP(inc, 32)
2811EMIT_LOCKED_UNARY_OP(dec, 32)
2812EMIT_LOCKED_UNARY_OP(not, 32)
2813EMIT_LOCKED_UNARY_OP(neg, 32)
2814
2815EMIT_LOCKED_UNARY_OP(inc, 16)
2816EMIT_LOCKED_UNARY_OP(dec, 16)
2817EMIT_LOCKED_UNARY_OP(not, 16)
2818EMIT_LOCKED_UNARY_OP(neg, 16)
2819
2820EMIT_LOCKED_UNARY_OP(inc, 8)
2821EMIT_LOCKED_UNARY_OP(dec, 8)
2822EMIT_LOCKED_UNARY_OP(not, 8)
2823EMIT_LOCKED_UNARY_OP(neg, 8)
2824# endif
2825
2826#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2827
2828
2829/*********************************************************************************************************************************
2830* Shifting and Rotating *
2831*********************************************************************************************************************************/
2832
2833/*
2834 * ROL
2835 */
2836#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2837IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2838{ \
2839 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2840 if (cShift) \
2841 { \
2842 if (a_cBitsWidth < 32) \
2843 cShift &= a_cBitsWidth - 1; \
2844 a_uType const uDst = *puDst; \
2845 a_uType const uResult = a_fnHlp(uDst, cShift); \
2846 *puDst = uResult; \
2847 \
2848 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
2849 it the same way as for 1 bit shifts. */ \
2850 AssertCompile(X86_EFL_CF_BIT == 0); \
2851 uint32_t fEfl = *pfEFlags; \
2852 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2853 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2854 fEfl |= fCarry; \
2855 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2856 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2857 else /* Intel 10980XE: According to the first sub-shift: */ \
2858 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2859 *pfEFlags = fEfl; \
2860 } \
2861}
2862
2863#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2864EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2865#endif
2866EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2867EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2868
2869#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2870EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2871#endif
2872EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2873EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2874
2875DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2876{
2877 return (uValue << cShift) | (uValue >> (16 - cShift));
2878}
2879#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2880EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2881#endif
2882EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2883EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2884
2885DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2886{
2887 return (uValue << cShift) | (uValue >> (8 - cShift));
2888}
2889#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2890EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2891#endif
2892EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2893EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2894
2895
2896/*
2897 * ROR
2898 */
2899#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2900IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2901{ \
2902 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2903 if (cShift) \
2904 { \
2905 if (a_cBitsWidth < 32) \
2906 cShift &= a_cBitsWidth - 1; \
2907 a_uType const uDst = *puDst; \
2908 a_uType const uResult = a_fnHlp(uDst, cShift); \
2909 *puDst = uResult; \
2910 \
2911 /* Calc EFLAGS: */ \
2912 AssertCompile(X86_EFL_CF_BIT == 0); \
2913 uint32_t fEfl = *pfEFlags; \
2914 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2915 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2916 fEfl |= fCarry; \
2917 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2918 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2919 else /* Intel 10980XE: According to the first sub-shift: */ \
2920 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2921 *pfEFlags = fEfl; \
2922 } \
2923}
2924
2925#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2926EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2927#endif
2928EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2929EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2930
2931#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2932EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2933#endif
2934EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2935EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2936
2937DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2938{
2939 return (uValue >> cShift) | (uValue << (16 - cShift));
2940}
2941#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2942EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2943#endif
2944EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2945EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2946
2947DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2948{
2949 return (uValue >> cShift) | (uValue << (8 - cShift));
2950}
2951#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2952EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2953#endif
2954EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2955EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2956
2957
2958/*
2959 * RCL
2960 */
2961#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2962IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2963{ \
2964 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2965 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2966 cShift %= a_cBitsWidth + 1; \
2967 if (cShift) \
2968 { \
2969 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2970 cShift %= a_cBitsWidth + 1; \
2971 a_uType const uDst = *puDst; \
2972 a_uType uResult = uDst << cShift; \
2973 if (cShift > 1) \
2974 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2975 \
2976 AssertCompile(X86_EFL_CF_BIT == 0); \
2977 uint32_t fEfl = *pfEFlags; \
2978 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2979 uResult |= (a_uType)fInCarry << (cShift - 1); \
2980 \
2981 *puDst = uResult; \
2982 \
2983 /* Calc EFLAGS. */ \
2984 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2985 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2986 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2987 fEfl |= fOutCarry; \
2988 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2989 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
2990 else /* Intel 10980XE: According to the first sub-shift: */ \
2991 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2992 *pfEFlags = fEfl; \
2993 } \
2994}
2995
2996#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2997EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
2998#endif
2999EMIT_RCL(64, uint64_t, _intel, 1)
3000EMIT_RCL(64, uint64_t, _amd, 0)
3001
3002#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3003EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3004#endif
3005EMIT_RCL(32, uint32_t, _intel, 1)
3006EMIT_RCL(32, uint32_t, _amd, 0)
3007
3008#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3009EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3010#endif
3011EMIT_RCL(16, uint16_t, _intel, 1)
3012EMIT_RCL(16, uint16_t, _amd, 0)
3013
3014#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3015EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3016#endif
3017EMIT_RCL(8, uint8_t, _intel, 1)
3018EMIT_RCL(8, uint8_t, _amd, 0)
3019
3020
3021/*
3022 * RCR
3023 */
3024#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3025IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3026{ \
3027 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3028 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3029 cShift %= a_cBitsWidth + 1; \
3030 if (cShift) \
3031 { \
3032 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3033 cShift %= a_cBitsWidth + 1; \
3034 a_uType const uDst = *puDst; \
3035 a_uType uResult = uDst >> cShift; \
3036 if (cShift > 1) \
3037 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3038 \
3039 AssertCompile(X86_EFL_CF_BIT == 0); \
3040 uint32_t fEfl = *pfEFlags; \
3041 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3042 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3043 *puDst = uResult; \
3044 \
3045 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3046 it the same way as for 1 bit shifts. */ \
3047 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3048 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3049 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3050 fEfl |= fOutCarry; \
3051 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3052 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3053 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3054 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3055 *pfEFlags = fEfl; \
3056 } \
3057}
3058
3059#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3060EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3061#endif
3062EMIT_RCR(64, uint64_t, _intel, 1)
3063EMIT_RCR(64, uint64_t, _amd, 0)
3064
3065#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3066EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3067#endif
3068EMIT_RCR(32, uint32_t, _intel, 1)
3069EMIT_RCR(32, uint32_t, _amd, 0)
3070
3071#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3072EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3073#endif
3074EMIT_RCR(16, uint16_t, _intel, 1)
3075EMIT_RCR(16, uint16_t, _amd, 0)
3076
3077#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3078EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3079#endif
3080EMIT_RCR(8, uint8_t, _intel, 1)
3081EMIT_RCR(8, uint8_t, _amd, 0)
3082
3083
3084/*
3085 * SHL
3086 */
3087#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3088IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3089{ \
3090 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3091 if (cShift) \
3092 { \
3093 a_uType const uDst = *puDst; \
3094 a_uType uResult = uDst << cShift; \
3095 *puDst = uResult; \
3096 \
3097 /* Calc EFLAGS. */ \
3098 AssertCompile(X86_EFL_CF_BIT == 0); \
3099 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3100 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3101 fEfl |= fCarry; \
3102 if (!a_fIntelFlags) \
3103 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3104 else \
3105 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3106 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3107 fEfl |= X86_EFL_CALC_ZF(uResult); \
3108 fEfl |= g_afParity[uResult & 0xff]; \
3109 if (!a_fIntelFlags) \
3110 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3111 *pfEFlags = fEfl; \
3112 } \
3113}
3114
3115#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3116EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3117#endif
3118EMIT_SHL(64, uint64_t, _intel, 1)
3119EMIT_SHL(64, uint64_t, _amd, 0)
3120
3121#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3122EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3123#endif
3124EMIT_SHL(32, uint32_t, _intel, 1)
3125EMIT_SHL(32, uint32_t, _amd, 0)
3126
3127#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3128EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3129#endif
3130EMIT_SHL(16, uint16_t, _intel, 1)
3131EMIT_SHL(16, uint16_t, _amd, 0)
3132
3133#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3134EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3135#endif
3136EMIT_SHL(8, uint8_t, _intel, 1)
3137EMIT_SHL(8, uint8_t, _amd, 0)
3138
3139
3140/*
3141 * SHR
3142 */
3143#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3144IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3145{ \
3146 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3147 if (cShift) \
3148 { \
3149 a_uType const uDst = *puDst; \
3150 a_uType uResult = uDst >> cShift; \
3151 *puDst = uResult; \
3152 \
3153 /* Calc EFLAGS. */ \
3154 AssertCompile(X86_EFL_CF_BIT == 0); \
3155 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3156 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3157 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3158 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3159 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3160 fEfl |= X86_EFL_CALC_ZF(uResult); \
3161 fEfl |= g_afParity[uResult & 0xff]; \
3162 if (!a_fIntelFlags) \
3163 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3164 *pfEFlags = fEfl; \
3165 } \
3166}
3167
3168#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3169EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3170#endif
3171EMIT_SHR(64, uint64_t, _intel, 1)
3172EMIT_SHR(64, uint64_t, _amd, 0)
3173
3174#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3175EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3176#endif
3177EMIT_SHR(32, uint32_t, _intel, 1)
3178EMIT_SHR(32, uint32_t, _amd, 0)
3179
3180#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3181EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3182#endif
3183EMIT_SHR(16, uint16_t, _intel, 1)
3184EMIT_SHR(16, uint16_t, _amd, 0)
3185
3186#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3187EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3188#endif
3189EMIT_SHR(8, uint8_t, _intel, 1)
3190EMIT_SHR(8, uint8_t, _amd, 0)
3191
3192
3193/*
3194 * SAR
3195 */
3196#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3197IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3198{ \
3199 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3200 if (cShift) \
3201 { \
3202 a_iType const iDst = (a_iType)*puDst; \
3203 a_uType uResult = iDst >> cShift; \
3204 *puDst = uResult; \
3205 \
3206 /* Calc EFLAGS. \
3207 Note! The OF flag is always zero because the result never differs from the input. */ \
3208 AssertCompile(X86_EFL_CF_BIT == 0); \
3209 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3210 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3211 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3212 fEfl |= X86_EFL_CALC_ZF(uResult); \
3213 fEfl |= g_afParity[uResult & 0xff]; \
3214 if (!a_fIntelFlags) \
3215 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3216 *pfEFlags = fEfl; \
3217 } \
3218}
3219
3220#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3221EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3222#endif
3223EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3224EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3225
3226#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3227EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3228#endif
3229EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3230EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3231
3232#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3233EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3234#endif
3235EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3236EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3237
3238#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3239EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3240#endif
3241EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3242EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3243
3244
3245/*
3246 * SHLD
3247 *
3248 * - CF is the last bit shifted out of puDst.
3249 * - AF is always cleared by Intel 10980XE.
3250 * - AF is always set by AMD 3990X.
3251 * - OF is set according to the first shift on Intel 10980XE, it seems.
3252 * - OF is set according to the last sub-shift on AMD 3990X.
3253 * - ZF, SF and PF are calculated according to the result by both vendors.
3254 *
3255 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3256 * pick either the source register or the destination register for input bits
3257 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3258 * intel has changed behaviour here several times. We implement what current
3259 * skylake based does for now, we can extend this later as needed.
3260 */
3261#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3262IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3263 uint32_t *pfEFlags)) \
3264{ \
3265 cShift &= a_cBitsWidth - 1; \
3266 if (cShift) \
3267 { \
3268 a_uType const uDst = *puDst; \
3269 a_uType uResult = uDst << cShift; \
3270 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3271 *puDst = uResult; \
3272 \
3273 /* CALC EFLAGS: */ \
3274 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3275 if (a_fIntelFlags) \
3276 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3277 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3278 else \
3279 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3280 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3281 fEfl |= X86_EFL_AF; \
3282 } \
3283 AssertCompile(X86_EFL_CF_BIT == 0); \
3284 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3285 fEfl |= g_afParity[uResult & 0xff]; \
3286 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3287 fEfl |= X86_EFL_CALC_ZF(uResult); \
3288 *pfEFlags = fEfl; \
3289 } \
3290}
3291
3292#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3293EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3294#endif
3295EMIT_SHLD(64, uint64_t, _intel, 1)
3296EMIT_SHLD(64, uint64_t, _amd, 0)
3297
3298#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3299EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3300#endif
3301EMIT_SHLD(32, uint32_t, _intel, 1)
3302EMIT_SHLD(32, uint32_t, _amd, 0)
3303
3304#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3305IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3306{ \
3307 cShift &= 31; \
3308 if (cShift) \
3309 { \
3310 uint16_t const uDst = *puDst; \
3311 uint64_t const uTmp = a_fIntelFlags \
3312 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3313 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3314 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3315 *puDst = uResult; \
3316 \
3317 /* CALC EFLAGS: */ \
3318 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3319 AssertCompile(X86_EFL_CF_BIT == 0); \
3320 if (a_fIntelFlags) \
3321 { \
3322 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3323 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3324 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3325 } \
3326 else \
3327 { \
3328 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3329 if (cShift < 16) \
3330 { \
3331 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3332 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3333 } \
3334 else \
3335 { \
3336 if (cShift == 16) \
3337 fEfl |= uDst & X86_EFL_CF; \
3338 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3339 } \
3340 fEfl |= X86_EFL_AF; \
3341 } \
3342 fEfl |= g_afParity[uResult & 0xff]; \
3343 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3344 fEfl |= X86_EFL_CALC_ZF(uResult); \
3345 *pfEFlags = fEfl; \
3346 } \
3347}
3348
3349#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3350EMIT_SHLD_16(RT_NOTHING, 1)
3351#endif
3352EMIT_SHLD_16(_intel, 1)
3353EMIT_SHLD_16(_amd, 0)
3354
3355
3356/*
3357 * SHRD
3358 *
3359 * EFLAGS behaviour seems to be the same as with SHLD:
3360 * - CF is the last bit shifted out of puDst.
3361 * - AF is always cleared by Intel 10980XE.
3362 * - AF is always set by AMD 3990X.
3363 * - OF is set according to the first shift on Intel 10980XE, it seems.
3364 * - OF is set according to the last sub-shift on AMD 3990X.
3365 * - ZF, SF and PF are calculated according to the result by both vendors.
3366 *
3367 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3368 * pick either the source register or the destination register for input bits
3369 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3370 * intel has changed behaviour here several times. We implement what current
3371 * skylake based does for now, we can extend this later as needed.
3372 */
3373#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3374IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3375{ \
3376 cShift &= a_cBitsWidth - 1; \
3377 if (cShift) \
3378 { \
3379 a_uType const uDst = *puDst; \
3380 a_uType uResult = uDst >> cShift; \
3381 uResult |= uSrc << (a_cBitsWidth - cShift); \
3382 *puDst = uResult; \
3383 \
3384 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3385 AssertCompile(X86_EFL_CF_BIT == 0); \
3386 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3387 if (a_fIntelFlags) \
3388 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3389 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3390 else \
3391 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3392 if (cShift > 1) /* Set according to last shift. */ \
3393 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3394 else \
3395 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3396 fEfl |= X86_EFL_AF; \
3397 } \
3398 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3399 fEfl |= X86_EFL_CALC_ZF(uResult); \
3400 fEfl |= g_afParity[uResult & 0xff]; \
3401 *pfEFlags = fEfl; \
3402 } \
3403}
3404
3405#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3406EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3407#endif
3408EMIT_SHRD(64, uint64_t, _intel, 1)
3409EMIT_SHRD(64, uint64_t, _amd, 0)
3410
3411#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3412EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3413#endif
3414EMIT_SHRD(32, uint32_t, _intel, 1)
3415EMIT_SHRD(32, uint32_t, _amd, 0)
3416
/**
 * Emits a 16-bit SHRD worker named iemAImpl_shrd_u16<a_Suffix>.
 *
 * The shift count is masked with 31, so it may exceed the 16-bit operand
 * width.  To handle that, a wide temporary is built with the destination in
 * bits 0..15, the source in bits 16..31, and either the destination again
 * (a_fIntelFlags != 0) or the source again (a_fIntelFlags == 0) in bits
 * 32..47.  The result is the low 16 bits of that temporary shifted right by
 * the count.
 *
 * Nothing is modified (neither the destination nor the flags) when the masked
 * count is zero.  Otherwise all status flags are recalculated: CF, OF and AF
 * follow the vendor specific rules noted inline, while SF, ZF and PF are
 * derived from the result.
 *
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero selects the Intel operand layout and flag
 *                          rules, zero selects the AMD ones.
 */
#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= 31; \
    if (cShift) \
    { \
        uint16_t const uDst    = *puDst; \
        uint64_t const uTmp    = a_fIntelFlags \
                               ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
                               : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
        uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
        *puDst = uResult; \
        \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        if (a_fIntelFlags) \
        { \
            /* Intel 10980XE: The CF is the last bit shifted out of the combined uTmp operand. */ \
            fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
            /* Intel 6700K & 10980XE: OF is set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
        } \
        else \
        { \
            /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
            fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
            /* AMD 3990X: OF is set according to the last shift. AF always set. */ \
            if (cShift > 1) /* Compare the value before the last single-bit shift with the result. */ \
                fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
            else \
                fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= X86_EFL_CALC_SF(uResult, 16); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        *pfEFlags = fEfl; \
    } \
}

/* The unsuffixed variant is only emitted when the condition below holds; the
   _intel and _amd flavours are always emitted. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD_16(RT_NOTHING, 1)
#endif
EMIT_SHRD_16(_intel,     1)
EMIT_SHRD_16(_amd,       0)
3462
3463
3464/*
3465 * RORX (BMI2)
3466 */
3467#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3468IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3469{ \
3470 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3471}
3472
3473#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3474EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3475#endif
3476#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3477EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3478#endif
3479
3480
3481/*
3482 * SHLX (BMI2)
3483 */
3484#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3485IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3486{ \
3487 cShift &= a_cBitsWidth - 1; \
3488 *puDst = uSrc << cShift; \
3489}
3490
3491#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3492EMIT_SHLX(64, uint64_t, RT_NOTHING)
3493EMIT_SHLX(64, uint64_t, _fallback)
3494#endif
3495#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3496EMIT_SHLX(32, uint32_t, RT_NOTHING)
3497EMIT_SHLX(32, uint32_t, _fallback)
3498#endif
3499
3500
3501/*
3502 * SHRX (BMI2)
3503 */
3504#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3505IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3506{ \
3507 cShift &= a_cBitsWidth - 1; \
3508 *puDst = uSrc >> cShift; \
3509}
3510
3511#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3512EMIT_SHRX(64, uint64_t, RT_NOTHING)
3513EMIT_SHRX(64, uint64_t, _fallback)
3514#endif
3515#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3516EMIT_SHRX(32, uint32_t, RT_NOTHING)
3517EMIT_SHRX(32, uint32_t, _fallback)
3518#endif
3519
3520
3521/*
3522 * SARX (BMI2)
3523 */
3524#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3525IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3526{ \
3527 cShift &= a_cBitsWidth - 1; \
3528 *puDst = (a_iType)uSrc >> cShift; \
3529}
3530
3531#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3532EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3533EMIT_SARX(64, uint64_t, int64_t, _fallback)
3534#endif
3535#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3536EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3537EMIT_SARX(32, uint32_t, int32_t, _fallback)
3538#endif
3539
3540
3541/*
3542 * PDEP (BMI2)
3543 */
3544#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3545IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3546{ \
3547 a_uType uResult = 0; \
3548 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3549 if (fMask & ((a_uType)1 << iMaskBit)) \
3550 { \
3551 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3552 iBit++; \
3553 } \
3554 *puDst = uResult; \
3555}
3556
3557#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3558EMIT_PDEP(64, uint64_t, RT_NOTHING)
3559#endif
3560EMIT_PDEP(64, uint64_t, _fallback)
3561#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3562EMIT_PDEP(32, uint32_t, RT_NOTHING)
3563#endif
3564EMIT_PDEP(32, uint32_t, _fallback)
3565
3566/*
3567 * PEXT (BMI2)
3568 */
3569#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3570IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3571{ \
3572 a_uType uResult = 0; \
3573 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3574 if (fMask & ((a_uType)1 << iMaskBit)) \
3575 { \
3576 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3577 iBit++; \
3578 } \
3579 *puDst = uResult; \
3580}
3581
3582#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3583EMIT_PEXT(64, uint64_t, RT_NOTHING)
3584#endif
3585EMIT_PEXT(64, uint64_t, _fallback)
3586#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3587EMIT_PEXT(32, uint32_t, RT_NOTHING)
3588#endif
3589EMIT_PEXT(32, uint32_t, _fallback)
3590
3591
3592#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3593
3594# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3595/*
3596 * BSWAP
3597 */
3598
3599IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3600{
3601 *puDst = ASMByteSwapU64(*puDst);
3602}
3603
3604
3605IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3606{
3607 *puDst = ASMByteSwapU32(*puDst);
3608}
3609
3610
/**
 * BSWAP with a 16-bit operand.
 *
 * @note    This form is undocumented, hence the 32-bit argument.  Only the
 *          first 16 bits at @a puDst are written (set to zero); the disabled
 *          alternative would byte-swap those 16 bits instead.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
{
#if 0
    *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
#else
    /* This is the behaviour of the AMD 3990x (64-bit mode): */
    *(uint16_t *)puDst = 0;
#endif
}
3621
3622# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3623
3624
3625
3626# if defined(IEM_WITHOUT_ASSEMBLY)
3627
/*
 * LFENCE, SFENCE & MFENCE.
 *
 * Each worker simply forwards to the corresponding ASM*Fence helper.
 */

/** LFENCE worker: issues ASMReadFence(). */
IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
{
    ASMReadFence();
}


/** SFENCE worker: issues ASMWriteFence(). */
IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
{
    ASMWriteFence();
}


/** MFENCE worker: issues ASMMemoryFence(). */
IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
{
    ASMMemoryFence();
}


/* The alternative memory fence is not emitted when RT_ARCH_ARM64 is defined. */
# ifndef RT_ARCH_ARM64
/** Alternative memory fence worker: issues ASMMemoryFence(), same as iemAImpl_mfence. */
IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
{
    ASMMemoryFence();
}
# endif
3656
3657# endif
3658
3659#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3660
3661
3662IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3663{
3664 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3665 {
3666 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3667 *pu16Dst |= u16Src & X86_SEL_RPL;
3668
3669 *pfEFlags |= X86_EFL_ZF;
3670 }
3671 else
3672 *pfEFlags &= ~X86_EFL_ZF;
3673}
3674
3675
3676#if defined(IEM_WITHOUT_ASSEMBLY)
3677
3678/*********************************************************************************************************************************
3679* x87 FPU Loads *
3680*********************************************************************************************************************************/
3681
/**
 * x87 load worker: converts a 32-bit floating point value to 80-bit format.
 *
 * The returned FSW has TOP set to 7 and C0, C2 & C3 copied from the current
 * FSW; exception and summary bits are added as noted per input class below.
 *
 * @param   pFpuState   The FPU state; FCW and FSW are read.
 * @param   pFpuRes     Where to return the 80-bit value and the FSW.
 * @param   pr32Val     The 32-bit value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (RTFLOAT32U_IS_NORMAL(pr32Val))
    {
        /* Normal: widen the fraction and rebias the exponent. */
        pFpuRes->r80Result.sj64.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
        pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
        Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
    }
    else if (RTFLOAT32U_IS_ZERO(pr32Val))
    {
        /* Zero: keeps its sign. */
        pFpuRes->r80Result.s.fSign      = pr32Val->s.fSign;
        pFpuRes->r80Result.s.uExponent  = 0;
        pFpuRes->r80Result.s.uMantissa  = 0;
        Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
    }
    else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
    {
        /* Subnormal values get normalized. */
        pFpuRes->r80Result.sj64.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        /* cExtraShift is the number of leading zero bits in the 32-bit fraction. */
        unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
        pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
        /* Denormal operand exception; flagged as pending if unmasked. */
        pFpuRes->FSW |= X86_FSW_DE;
        if (!(pFpuState->FCW & X86_FCW_DM))
            pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
    }
    else if (RTFLOAT32U_IS_INF(pr32Val))
    {
        /* Infinity: max exponent, only the integer bit set in the mantissa. */
        pFpuRes->r80Result.s.fSign      = pr32Val->s.fSign;
        pFpuRes->r80Result.s.uExponent  = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.s.uMantissa  = RT_BIT_64(63);
        Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
    }
    else
    {
        /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
        Assert(RTFLOAT32U_IS_NAN(pr32Val));
        pFpuRes->r80Result.sj64.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
        if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
        {
            pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
            pFpuRes->FSW |= X86_FSW_IE;

            if (!(pFpuState->FCW & X86_FCW_IM))
            {
                /* The value is not pushed. */
                pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
                pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
                pFpuRes->r80Result.au64[0] = 0;
                pFpuRes->r80Result.au16[4] = 0;
            }
        }
        else
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
    }
}
3749
3750
/**
 * x87 load worker: converts a 64-bit floating point value to 80-bit format.
 *
 * The returned FSW has TOP set to 7 and C0, C2 & C3 copied from the current
 * FSW; exception and summary bits are added as noted per input class below.
 *
 * @param   pFpuState   The FPU state; FCW and FSW are read.
 * @param   pFpuRes     Where to return the 80-bit value and the FSW.
 * @param   pr64Val     The 64-bit value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (RTFLOAT64U_IS_NORMAL(pr64Val))
    {
        /* Normal: widen the fraction and rebias the exponent. */
        pFpuRes->r80Result.sj64.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
        pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
        Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
    }
    else if (RTFLOAT64U_IS_ZERO(pr64Val))
    {
        /* Zero: keeps its sign. */
        pFpuRes->r80Result.s.fSign      = pr64Val->s.fSign;
        pFpuRes->r80Result.s.uExponent  = 0;
        pFpuRes->r80Result.s.uMantissa  = 0;
        Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
    }
    else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
    {
        /* Subnormal values get normalized. */
        pFpuRes->r80Result.sj64.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        /* cExtraShift is the number of leading zero bits in the 64-bit fraction. */
        unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
        pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
        /* Denormal operand exception; flagged as pending if unmasked. */
        pFpuRes->FSW |= X86_FSW_DE;
        if (!(pFpuState->FCW & X86_FCW_DM))
            pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
    }
    else if (RTFLOAT64U_IS_INF(pr64Val))
    {
        /* Infinity: max exponent, only the integer bit set in the mantissa. */
        pFpuRes->r80Result.s.fSign      = pr64Val->s.fSign;
        pFpuRes->r80Result.s.uExponent  = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.s.uMantissa  = RT_BIT_64(63);
        Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
    }
    else
    {
        /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
        Assert(RTFLOAT64U_IS_NAN(pr64Val));
        pFpuRes->r80Result.sj64.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
        if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
        {
            pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
            pFpuRes->FSW |= X86_FSW_IE;

            if (!(pFpuState->FCW & X86_FCW_IM))
            {
                /* The value is not pushed. */
                pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
                pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
                pFpuRes->r80Result.au64[0] = 0;
                pFpuRes->r80Result.au16[4] = 0;
            }
        }
        else
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
    }
}
3816
3817
3818IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3819{
3820 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3821 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3822 /* Raises no exceptions. */
3823 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3824}
3825
3826
3827IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3828{
3829 pFpuRes->r80Result.sj64.fSign = 0;
3830 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3831 pFpuRes->r80Result.sj64.fInteger = 1;
3832 pFpuRes->r80Result.sj64.uFraction = 0;
3833
3834 /*
3835 * FPU status word:
3836 * - TOP is irrelevant, but we must match x86 assembly version.
3837 * - C1 is always cleared as we don't have any stack overflows.
3838 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3839 */
3840 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3841}
3842
3843
3844IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3845{
3846 pFpuRes->r80Result.sj64.fSign = 0;
3847 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3848 pFpuRes->r80Result.sj64.fInteger = 1;
3849 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3850 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3851 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3852 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3853}
3854
3855
3856IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3857{
3858 pFpuRes->r80Result.sj64.fSign = 0;
3859 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3860 pFpuRes->r80Result.sj64.fInteger = 1;
3861 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3862 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3863 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3864}
3865
3866
3867IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3868{
3869 pFpuRes->r80Result.sj64.fSign = 0;
3870 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3871 pFpuRes->r80Result.sj64.fInteger = 1;
3872 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3873 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3874 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3875 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3876}
3877
3878
3879IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3880{
3881 pFpuRes->r80Result.sj64.fSign = 0;
3882 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3883 pFpuRes->r80Result.sj64.fInteger = 1;
3884 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3885 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3886 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3887 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3888}
3889
3890
3891IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3892{
3893 pFpuRes->r80Result.sj64.fSign = 0;
3894 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3895 pFpuRes->r80Result.sj64.fInteger = 1;
3896 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3897 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3898 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3899 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3900}
3901
3902
3903IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3904{
3905 pFpuRes->r80Result.s.fSign = 0;
3906 pFpuRes->r80Result.s.uExponent = 0;
3907 pFpuRes->r80Result.s.uMantissa = 0;
3908 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3909}
3910
3911#define EMIT_FILD(a_cBits) \
3912IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3913 int ## a_cBits ## _t const *piVal)) \
3914{ \
3915 int ## a_cBits ## _t iVal = *piVal; \
3916 if (iVal == 0) \
3917 { \
3918 pFpuRes->r80Result.s.fSign = 0; \
3919 pFpuRes->r80Result.s.uExponent = 0; \
3920 pFpuRes->r80Result.s.uMantissa = 0; \
3921 } \
3922 else \
3923 { \
3924 if (iVal > 0) \
3925 pFpuRes->r80Result.s.fSign = 0; \
3926 else \
3927 { \
3928 pFpuRes->r80Result.s.fSign = 1; \
3929 iVal = -iVal; \
3930 } \
3931 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3932 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3933 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3934 } \
3935 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3936}
3937EMIT_FILD(16)
3938EMIT_FILD(32)
3939EMIT_FILD(64)
3940
3941
3942IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3943{
3944 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3945 if ( pd80Val->s.abPairs[0] == 0
3946 && pd80Val->s.abPairs[1] == 0
3947 && pd80Val->s.abPairs[2] == 0
3948 && pd80Val->s.abPairs[3] == 0
3949 && pd80Val->s.abPairs[4] == 0
3950 && pd80Val->s.abPairs[5] == 0
3951 && pd80Val->s.abPairs[6] == 0
3952 && pd80Val->s.abPairs[7] == 0
3953 && pd80Val->s.abPairs[8] == 0)
3954 {
3955 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3956 pFpuRes->r80Result.s.uExponent = 0;
3957 pFpuRes->r80Result.s.uMantissa = 0;
3958 }
3959 else
3960 {
3961 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3962
3963 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3964 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3965 cPairs--;
3966
3967 uint64_t uVal = 0;
3968 uint64_t uFactor = 1;
3969 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3970 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3971 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3972
3973 unsigned const cBits = ASMBitLastSetU64(uVal);
3974 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3975 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3976 }
3977}
3978
3979
3980/*********************************************************************************************************************************
3981* x87 FPU Stores *
3982*********************************************************************************************************************************/
3983
/**
 * Helper for storing a deconstructed and normal R80 value as a 32-bit one.
 *
 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
 *
 * @returns Updated FPU status word value.
 * @param   fSignIn         Incoming sign indicator.
 * @param   uMantissaIn     Incoming mantissa (dot between bit 63 and 62).
 * @param   iExponentIn     Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            Prepped FPU status word, i.e. exceptions and C1 clear.
 * @param   pr32Dst         Where to return the output value, if one should be
 *                          returned.  Not written when returning early because
 *                          of an unmasked underflow or overflow exception.
 *
 * @note    Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
 * @note    Exact same logic as iemAImpl_StoreNormalR80AsR64.
 */
static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
                                             uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
{
    /* Mask of the low mantissa bits that do not fit the 32-bit fraction (0xffffffffff with 63 and 23 fraction bits). */
    uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1;
    /* Value added before truncating: half an output ULP for nearest, the whole mask when
       rounding away from zero for this sign, and nothing when truncating. */
    uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                    ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1)
                                    : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                    ? fRoundingOffMask
                                    : 0;
    uint64_t       fRoundedOff      = uMantissaIn & fRoundingOffMask;

    /*
     * Deal with potential overflows/underflows first, optimizing for none.
     * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
     */
    int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
    if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
    { /* likely? */ }
    /*
     * Underflow if the exponent is zero or negative.  An attempt is made to
     * map this to a subnormal number when possible, with some additional
     * trickery ofc.
     */
    else if (iExponentOut <= 0)
    {
        bool const fIsTiny = iExponentOut < 0
                          || UINT64_MAX - uMantissaIn > uRoundingAdd;
        if (!(fFcw & X86_FCW_UM) && fIsTiny)
            /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
            return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;

        if (iExponentOut <= 0)
        {
            /* Shift the mantissa into subnormal position, ORing any bits shifted out into bit 0 (sticky). */
            uMantissaIn = iExponentOut <= -63
                        ? uMantissaIn != 0
                        : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
            fRoundedOff = uMantissaIn & fRoundingOffMask;
            if (fRoundedOff && fIsTiny)
                fFsw |= X86_FSW_UE;
            iExponentOut = 0;
        }
    }
    /*
     * Overflow if at or above max exponent value or if we will reach max
     * when rounding.  Will return +/-infinity or +/-max value depending on
     * whether we're rounding or not.
     */
    else if (   iExponentOut >= RTFLOAT32U_EXP_MAX
             || (   iExponentOut == RTFLOAT32U_EXP_MAX - 1
                 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
    {
        fFsw |= X86_FSW_OE;
        if (!(fFcw & X86_FCW_OM))
            return fFsw | X86_FSW_ES | X86_FSW_B;
        fFsw |= X86_FSW_PE;
        if (uRoundingAdd)
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;

        pr32Dst->s.fSign = fSignIn;
        if (uRoundingAdd)
        {   /* Infinity (max exponent, zero fraction) */
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = 0;
        }
        else
        {   /* Max */
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
            pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
        }
        return fFsw;
    }

    /*
     * Normal or subnormal number.
     */
    /* Do rounding - just truncate in near mode when midway on an even outcome. */
    uint64_t uMantissaOut = uMantissaIn;
    if (   (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
        || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
        || fRoundedOff != uRoundingAdd)
    {
        uMantissaOut = uMantissaIn + uRoundingAdd;
        if (uMantissaOut >= uMantissaIn)
        { /* likely */ }
        else
        {
            /* The addition wrapped around, i.e. the mantissa carried out of bit 63. */
            uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
            iExponentOut++;
            Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
            fFsw |= X86_FSW_C1;
        }
    }
    else
        uMantissaOut = uMantissaIn;

    /* Truncate the mantissa and set the return value. */
    uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;

    pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
    pr32Dst->s.uExponent = iExponentOut;
    pr32Dst->s.fSign     = fSignIn;

    /* Set status flags related to rounding. */
    if (fRoundedOff)
    {
        fFsw |= X86_FSW_PE;
        /* C1 indicates that the value was rounded up (in magnitude). */
        if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    return fFsw;
}
4116
4117
/**
 * x87 store worker: converts an 80-bit floating point value to 32-bit format.
 *
 * The returned FSW starts out with TOP set to 7 and C0, C2 & C3 copied from
 * the current FSW; exception, C1 and summary bits are added depending on the
 * input class and the FCW exception masks.  In the unmasked exception paths
 * the destination is not written.
 *
 * @param   pFpuState   The FPU state; FCW and FSW are read.
 * @param   pu16FSW     Where to return the resulting FSW.
 * @param   pr32Dst     Where to store the 32-bit value.
 * @param   pr80Src     The 80-bit value to convert.
 *
 * @note    Exact same logic as iemAImpl_fst_r80_to_r64.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t       fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
        /* Normal values: rounding, overflow and underflow are handled by the helper. */
        fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
                                            (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
    {
        /* Zero: keeps its sign. */
        pr32Dst->s.fSign     = pr80Src->s.fSign;
        pr32Dst->s.uExponent = 0;
        pr32Dst->s.uFraction = 0;
        Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
    }
    else if (RTFLOAT80U_IS_INF(pr80Src))
    {
        /* Infinity: keeps its sign. */
        pr32Dst->s.fSign     = pr80Src->s.fSign;
        pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
        pr32Dst->s.uFraction = 0;
        Assert(RTFLOAT32U_IS_INF(pr32Dst));
    }
    else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
    {
        /* Mapped to +/-QNaN */
        pr32Dst->s.fSign     = pr80Src->s.fSign;
        pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
        pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
    }
    else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
    {
        /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
        if (fFcw & X86_FCW_IM)
        {
            /* Masked invalid operation: store a negative QNaN with only the top fraction bit set. */
            pr32Dst->s.fSign     = 1;
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
            fFsw |= X86_FSW_IE;
        }
        else
            fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
    }
    else if (RTFLOAT80U_IS_NAN(pr80Src))
    {
        /* IM applies to signalled NaN input only.  Everything is converted to quiet NaN. */
        if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
        {
            /* Keep the top fraction bits of the payload and force the quiet bit. */
            pr32Dst->s.fSign     = pr80Src->s.fSign;
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
            pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
            if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
                fFsw |= X86_FSW_IE;
        }
        else
            fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
    }
    else
    {
        /* Denormal values cause both an underflow and precision exception. */
        Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
        if (fFcw & X86_FCW_UM)
        {
            pr32Dst->s.fSign     = pr80Src->s.fSign;
            pr32Dst->s.uExponent = 0;
            /* Rounding away from zero for this sign gives the smallest subnormal, otherwise zero. */
            if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
            {
                pr32Dst->s.uFraction = 1;
                fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
                if (!(fFcw & X86_FCW_PM))
                    fFsw |= X86_FSW_ES | X86_FSW_B;
            }
            else
            {
                pr32Dst->s.uFraction = 0;
                fFsw |= X86_FSW_UE | X86_FSW_PE;
                if (!(fFcw & X86_FCW_PM))
                    fFsw |= X86_FSW_ES | X86_FSW_B;
            }
        }
        else
            fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
    }
    *pu16FSW = fFsw;
}
4206
4207
/**
 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
 *
 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
 *
 * @returns Updated FPU status word value.
 * @param   fSignIn         Incoming sign indicator.
 * @param   uMantissaIn     Incoming mantissa (dot between bit 63 and 62).
 * @param   iExponentIn     Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            Prepped FPU status word, i.e. exceptions and C1 clear.
 * @param   pr64Dst         Where to return the output value, if one should be
 *                          returned.  Not written when returning early because
 *                          of an unmasked underflow or overflow exception.
 *
 * @note    Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
 * @note    Exact same logic as iemAImpl_StoreNormalR80AsR32.
 */
static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
                                             uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
{
    /* Mask of the low mantissa bits that do not fit the 64-bit fraction. */
    uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
    /* Value added before truncating: half an output ULP for nearest, the whole mask when
       rounding away from zero for this sign, and nothing when truncating. */
    uint32_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                    ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
                                    : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                    ? fRoundingOffMask
                                    : 0;
    uint32_t       fRoundedOff      = uMantissaIn & fRoundingOffMask;

    /*
     * Deal with potential overflows/underflows first, optimizing for none.
     * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
     */
    int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
    if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
    { /* likely? */ }
    /*
     * Underflow if the exponent is zero or negative.  An attempt is made to
     * map this to a subnormal number when possible, with some additional
     * trickery ofc.
     */
    else if (iExponentOut <= 0)
    {
        bool const fIsTiny = iExponentOut < 0
                          || UINT64_MAX - uMantissaIn > uRoundingAdd;
        if (!(fFcw & X86_FCW_UM) && fIsTiny)
            /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
            return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;

        if (iExponentOut <= 0)
        {
            /* Shift the mantissa into subnormal position, ORing any bits shifted out into bit 0 (sticky). */
            uMantissaIn = iExponentOut <= -63
                        ? uMantissaIn != 0
                        : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
            fRoundedOff = uMantissaIn & fRoundingOffMask;
            if (fRoundedOff && fIsTiny)
                fFsw |= X86_FSW_UE;
            iExponentOut = 0;
        }
    }
    /*
     * Overflow if at or above max exponent value or if we will reach max
     * when rounding.  Will return +/-infinity or +/-max value depending on
     * whether we're rounding or not.
     */
    else if (   iExponentOut >= RTFLOAT64U_EXP_MAX
             || (   iExponentOut == RTFLOAT64U_EXP_MAX - 1
                 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
    {
        fFsw |= X86_FSW_OE;
        if (!(fFcw & X86_FCW_OM))
            return fFsw | X86_FSW_ES | X86_FSW_B;
        fFsw |= X86_FSW_PE;
        if (uRoundingAdd)
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;

        pr64Dst->s64.fSign = fSignIn;
        if (uRoundingAdd)
        {   /* Infinity (max exponent, zero fraction) */
            pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
            pr64Dst->s64.uFraction = 0;
        }
        else
        {   /* Max */
            pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
            pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
        }
        return fFsw;
    }

    /*
     * Normal or subnormal number.
     */
    /* Do rounding - just truncate in near mode when midway on an even outcome. */
    uint64_t uMantissaOut = uMantissaIn;
    if (   (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
        || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
        || fRoundedOff != uRoundingAdd)
    {
        uMantissaOut = uMantissaIn + uRoundingAdd;
        if (uMantissaOut >= uMantissaIn)
        { /* likely */ }
        else
        {
            /* The addition wrapped around, i.e. the mantissa carried out of bit 63. */
            uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
            iExponentOut++;
            Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
            fFsw |= X86_FSW_C1;
        }
    }
    else
        uMantissaOut = uMantissaIn;

    /* Truncate the mantissa and set the return value. */
    uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;

    pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
    pr64Dst->s64.uExponent = iExponentOut;
    pr64Dst->s64.fSign     = fSignIn;

    /* Set status flags related to rounding. */
    if (fRoundedOff)
    {
        fFsw |= X86_FSW_PE;
        /* C1 indicates that the value was rounded up (in magnitude). */
        if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    return fFsw;
}
4340
4341
4342/**
4343 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4344 */
4345IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4346 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4347{
4348 uint16_t const fFcw = pFpuState->FCW;
4349 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4350 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4351 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4352 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4353 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4354 {
4355 pr64Dst->s64.fSign = pr80Src->s.fSign;
4356 pr64Dst->s64.uExponent = 0;
4357 pr64Dst->s64.uFraction = 0;
4358 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4359 }
4360 else if (RTFLOAT80U_IS_INF(pr80Src))
4361 {
4362 pr64Dst->s64.fSign = pr80Src->s.fSign;
4363 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4364 pr64Dst->s64.uFraction = 0;
4365 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4366 }
4367 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4368 {
4369 /* Mapped to +/-QNaN */
4370 pr64Dst->s64.fSign = pr80Src->s.fSign;
4371 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4372 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4373 }
4374 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4375 {
4376 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4377 if (fFcw & X86_FCW_IM)
4378 {
4379 pr64Dst->s64.fSign = 1;
4380 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4381 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4382 fFsw |= X86_FSW_IE;
4383 }
4384 else
4385 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4386 }
4387 else if (RTFLOAT80U_IS_NAN(pr80Src))
4388 {
4389 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4390 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4391 {
4392 pr64Dst->s64.fSign = pr80Src->s.fSign;
4393 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4394 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4395 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4396 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4397 fFsw |= X86_FSW_IE;
4398 }
4399 else
4400 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4401 }
4402 else
4403 {
4404 /* Denormal values causes both an underflow and precision exception. */
4405 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4406 if (fFcw & X86_FCW_UM)
4407 {
4408 pr64Dst->s64.fSign = pr80Src->s.fSign;
4409 pr64Dst->s64.uExponent = 0;
4410 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4411 {
4412 pr64Dst->s64.uFraction = 1;
4413 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4414 if (!(fFcw & X86_FCW_PM))
4415 fFsw |= X86_FSW_ES | X86_FSW_B;
4416 }
4417 else
4418 {
4419 pr64Dst->s64.uFraction = 0;
4420 fFsw |= X86_FSW_UE | X86_FSW_PE;
4421 if (!(fFcw & X86_FCW_PM))
4422 fFsw |= X86_FSW_ES | X86_FSW_B;
4423 }
4424 }
4425 else
4426 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4427 }
4428 *pu16FSW = fFsw;
4429}
4430
4431
4432IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4433 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4434{
4435 /*
4436 * FPU status word:
4437 * - TOP is irrelevant, but we must match x86 assembly version (0).
4438 * - C1 is always cleared as we don't have any stack overflows.
4439 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4440 */
4441 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4442 *pr80Dst = *pr80Src;
4443}
4444
4445
4446/*
4447 *
4448 * Mantissa:
4449 * 63 56 48 40 32 24 16 8 0
4450 * v v v v v v v v v
4451 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4452 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4453 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4454 *
4455 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4456 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4457 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4458 * where we'll drop off all but bit 63.
4459 */
/**
 * Emits iemAImpl_fist_r80_to_i<a_cBits>, which converts the 80-bit floating
 * point value in pr80Val to a signed integer, rounding according to the
 * rounding control bits of FCW.
 *
 * The emitted function returns the status word via pu16FSW: C0, C2 and C3 are
 * carried over from the input FSW; PE is raised whenever mantissa bits are
 * rounded off and C1 when the magnitude was rounded up; IE is raised for
 * values outside the integer range and for the remaining special encodings.
 * When the raised exception is unmasked, ES and B are set as well.  On an
 * unmasked IE, TOP is additionally set to 7 and *piDst is not written.
 *
 * @param   a_cBits             Width of the destination integer in bits.
 * @param   a_iType             The destination integer type.
 * @param   a_iTypeMin          The minimum value of @a a_iType.
 * @param   a_iTypeIndefinite   The value stored on a masked IE.
 *
 * @note    The masked-IE path taken when a positive value overflows after
 *          rounding stores @a a_iTypeMin rather than @a a_iTypeIndefinite.
 *          NOTE(review): presumably both are the same value for all three
 *          instantiations below - confirm.
 */
4460#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4461IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4462 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4463{ \
4464 uint16_t const fFcw = pFpuState->FCW; \
4465 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4466 bool const fSignIn = pr80Val->s.fSign; \
4467 \
4468 /* \
4469 * Deal with normal numbers first. \
4470 */ \
4471 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4472 { \
4473 uint64_t uMantissa = pr80Val->s.uMantissa; \
4474 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4475 \
4476 if ((uint32_t)iExponent <= a_cBits - 2) \
4477 { \
4478 unsigned const cShiftOff = 63 - iExponent; \
4479 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4480 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4481 ? RT_BIT_64(cShiftOff - 1) \
4482 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4483 ? fRoundingOffMask \
4484 : 0; \
4485 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4486 \
4487 uMantissa >>= cShiftOff; \
4488 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4489 uMantissa += uRounding; \
4490 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4491 { \
4492 if (fRoundedOff) \
4493 { \
4494 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4495 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4496 else if (uRounding) \
4497 fFsw |= X86_FSW_C1; \
4498 fFsw |= X86_FSW_PE; \
4499 if (!(fFcw & X86_FCW_PM)) \
4500 fFsw |= X86_FSW_ES | X86_FSW_B; \
4501 } \
4502 \
4503 if (!fSignIn) \
4504 *piDst = (a_iType)uMantissa; \
4505 else \
4506 *piDst = -(a_iType)uMantissa; \
4507 } \
4508 else \
4509 { \
4510 /* overflowed after rounding. */ \
4511 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4512 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4513 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4514 \
4515 /* Special case for the integer minimum value. */ \
4516 if (fSignIn) \
4517 { \
4518 *piDst = a_iTypeMin; \
4519 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4520 if (!(fFcw & X86_FCW_PM)) \
4521 fFsw |= X86_FSW_ES | X86_FSW_B; \
4522 } \
4523 else \
4524 { \
4525 fFsw |= X86_FSW_IE; \
4526 if (fFcw & X86_FCW_IM) \
4527 *piDst = a_iTypeMin; \
4528 else \
4529 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4530 } \
4531 } \
4532 } \
4533 /* \
4534 * Tiny sub-zero numbers. \
4535 */ \
4536 else if (iExponent < 0) \
4537 { \
4538 if (!fSignIn) \
4539 { \
4540 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4541 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4542 { \
4543 *piDst = 1; \
4544 fFsw |= X86_FSW_C1; \
4545 } \
4546 else \
4547 *piDst = 0; \
4548 } \
4549 else \
4550 { \
4551 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4552 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4553 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4554 *piDst = 0; \
4555 else \
4556 { \
4557 *piDst = -1; \
4558 fFsw |= X86_FSW_C1; \
4559 } \
4560 } \
4561 fFsw |= X86_FSW_PE; \
4562 if (!(fFcw & X86_FCW_PM)) \
4563 fFsw |= X86_FSW_ES | X86_FSW_B; \
4564 } \
4565 /* \
4566 * Special MIN case. \
4567 */ \
4568 else if ( fSignIn && iExponent == a_cBits - 1 \
4569 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4570 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4571 : uMantissa == RT_BIT_64(63))) \
4572 { \
4573 *piDst = a_iTypeMin; \
4574 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4575 { \
4576 fFsw |= X86_FSW_PE; \
4577 if (!(fFcw & X86_FCW_PM)) \
4578 fFsw |= X86_FSW_ES | X86_FSW_B; \
4579 } \
4580 } \
4581 /* \
4582 * Too large/small number outside the target integer range. \
4583 */ \
4584 else \
4585 { \
4586 fFsw |= X86_FSW_IE; \
4587 if (fFcw & X86_FCW_IM) \
4588 *piDst = a_iTypeIndefinite; \
4589 else \
4590 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4591 } \
4592 } \
4593 /* \
4594 * Map both +0 and -0 to integer zero (signless/+). \
4595 */ \
4596 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4597 *piDst = 0; \
4598 /* \
4599 * Denormals are just really tiny sub-zero numbers that are either rounded \
4600 * to zero, 1 or -1 depending on sign and rounding control. \
4601 */ \
4602 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4603 { \
4604 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4605 *piDst = 0; \
4606 else \
4607 { \
4608 *piDst = fSignIn ? -1 : 1; \
4609 fFsw |= X86_FSW_C1; \
4610 } \
4611 fFsw |= X86_FSW_PE; \
4612 if (!(fFcw & X86_FCW_PM)) \
4613 fFsw |= X86_FSW_ES | X86_FSW_B; \
4614 } \
4615 /* \
4616 * All other special values are considered invalid arguments and result \
4617 * in an IE exception and indefinite value if masked. \
4618 */ \
4619 else \
4620 { \
4621 fFsw |= X86_FSW_IE; \
4622 if (fFcw & X86_FCW_IM) \
4623 *piDst = a_iTypeIndefinite; \
4624 else \
4625 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4626 } \
4627 *pu16FSW = fFsw; \
4628}
4629EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4630EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4631EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4632
4633#endif /*IEM_WITHOUT_ASSEMBLY */
4634
4635
4636/*
4637 * The FISTT instruction was added with SSE3 and is a lot simpler than FIST,
4637 * as it always truncates instead of honoring the FCW rounding control.
4638 *
4639 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4640 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4641 * thus the @a a_cBitsIn.
4641 *
4641 * The macro emits iemAImpl_fistt_r80_to_i<a_cBits><a_Suffix>.  C0, C2 and C3
4641 * are carried over from the input FSW; PE is raised when mantissa bits are
4641 * dropped and IE for out-of-range and special values.  ES and B are set when
4641 * the raised exception is unmasked; an unmasked IE also sets TOP to 7 and
4641 * leaves *piDst unwritten.
4641 *
4641 * Macro parameters:
4641 *   - a_cBits:           Width of the destination integer; used for the
4641 *                        function name and the special MIN case.
4641 *   - a_cBitsIn:         Width used for the in-range exponent check.  All
4641 *                        instantiations below pass the same value as a_cBits.
4641 *   - a_iType:           The destination integer type.
4641 *   - a_iTypeMin:        Stored by the special MIN case.
4641 *   - a_iTypeMax:        Not referenced by the macro body.
4641 *   - a_iTypeIndefinite: Stored on a masked IE.
4641 *   - a_Suffix:          Appended to the function name.
4641 *   - a_fIntelVersion:   Not referenced by the macro body.
4641 *
4641 * The "Figure this weirdness" branch is disabled by its leading '0 &&'.
4642 */
4643#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4644IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4645 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4646{ \
4647 uint16_t const fFcw = pFpuState->FCW; \
4648 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4649 bool const fSignIn = pr80Val->s.fSign; \
4650 \
4651 /* \
4652 * Deal with normal numbers first. \
4653 */ \
4654 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4655 { \
4656 uint64_t uMantissa = pr80Val->s.uMantissa; \
4657 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4658 \
4659 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4660 { \
4661 unsigned const cShiftOff = 63 - iExponent; \
4662 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4663 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4664 uMantissa >>= cShiftOff; \
4665 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4666 if (!fSignIn) \
4667 *piDst = (a_iType)uMantissa; \
4668 else \
4669 *piDst = -(a_iType)uMantissa; \
4670 \
4671 if (fRoundedOff) \
4672 { \
4673 fFsw |= X86_FSW_PE; \
4674 if (!(fFcw & X86_FCW_PM)) \
4675 fFsw |= X86_FSW_ES | X86_FSW_B; \
4676 } \
4677 } \
4678 /* \
4679 * Tiny sub-zero numbers. \
4680 */ \
4681 else if (iExponent < 0) \
4682 { \
4683 *piDst = 0; \
4684 fFsw |= X86_FSW_PE; \
4685 if (!(fFcw & X86_FCW_PM)) \
4686 fFsw |= X86_FSW_ES | X86_FSW_B; \
4687 } \
4688 /* \
4689 * Special MIN case. \
4690 */ \
4691 else if ( fSignIn && iExponent == a_cBits - 1 \
4692 && (a_cBits < 64 \
4693 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4694 : uMantissa == RT_BIT_64(63)) ) \
4695 { \
4696 *piDst = a_iTypeMin; \
4697 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4698 { \
4699 fFsw |= X86_FSW_PE; \
4700 if (!(fFcw & X86_FCW_PM)) \
4701 fFsw |= X86_FSW_ES | X86_FSW_B; \
4702 } \
4703 } \
4704 /* \
4705 * Figure this weirdness. \
4706 */ \
4707 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4708 { \
4709 *piDst = 0; \
4710 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4711 { \
4712 fFsw |= X86_FSW_PE; \
4713 if (!(fFcw & X86_FCW_PM)) \
4714 fFsw |= X86_FSW_ES | X86_FSW_B; \
4715 } \
4716 } \
4717 /* \
4718 * Too large/small number outside the target integer range. \
4719 */ \
4720 else \
4721 { \
4722 fFsw |= X86_FSW_IE; \
4723 if (fFcw & X86_FCW_IM) \
4724 *piDst = a_iTypeIndefinite; \
4725 else \
4726 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4727 } \
4728 } \
4729 /* \
4730 * Map both +0 and -0 to integer zero (signless/+). \
4731 */ \
4732 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4733 *piDst = 0; \
4734 /* \
4735 * Denormals are just really tiny sub-zero numbers that are truncated to zero. \
4736 */ \
4737 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4738 { \
4739 *piDst = 0; \
4740 fFsw |= X86_FSW_PE; \
4741 if (!(fFcw & X86_FCW_PM)) \
4742 fFsw |= X86_FSW_ES | X86_FSW_B; \
4743 } \
4744 /* \
4745 * All other special values are considered invalid arguments and result \
4746 * in an IE exception and indefinite value if masked. \
4747 */ \
4748 else \
4749 { \
4750 fFsw |= X86_FSW_IE; \
4751 if (fFcw & X86_FCW_IM) \
4752 *piDst = a_iTypeIndefinite; \
4753 else \
4754 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4755 } \
4756 *pu16FSW = fFsw; \
4757}
4758#if defined(IEM_WITHOUT_ASSEMBLY)
4759EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4760EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4761EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4762#endif
4763EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4764EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4765
4766
4767#if defined(IEM_WITHOUT_ASSEMBLY)
4768
/**
 * Stores an 80-bit floating point value as an 80-bit packed BCD integer,
 * rounding according to the rounding control bits of FCW.
 *
 * C0, C2 and C3 are carried over from the input FSW.  PE is raised when
 * mantissa bits are rounded off and C1 when the magnitude was rounded up.
 * IE is raised for values that do not fit and for the remaining special
 * encodings.  ES and B are set when the raised exception is unmasked; an
 * unmasked IE additionally sets TOP to 7 and leaves *pd80Dst unwritten.
 *
 * @param   pFpuState   The FPU state; FCW and FSW are read from it.
 * @param   pu16FSW     Where to return the resulting status word.
 * @param   pd80Dst     Where to store the packed BCD result.
 * @param   pr80Src     The value to convert.
 */
4769IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4770 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4771{
4772 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
 /* The two-entry tables below are indexed by the sign bit of the input. */
4773 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4774 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4775 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4776 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4777
4778 uint16_t const fFcw = pFpuState->FCW;
4779 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4780 bool const fSignIn = pr80Src->s.fSign;
4781
4782 /*
4783 * Deal with normal numbers first.
4784 */
4785 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4786 {
4787 uint64_t uMantissa = pr80Src->s.uMantissa;
4788 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
 /* With an exponent of 59 the mantissa limit below keeps the magnitude under 10**18
    (0xde0b6b3a76400000 / 16 == 10**18). */
4789 if ( (uint32_t)iExponent <= 58
4790 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4791 {
4792 unsigned const cShiftOff = 63 - iExponent;
4793 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4794 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4795 ? RT_BIT_64(cShiftOff - 1)
4796 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4797 ? fRoundingOffMask
4798 : 0;
4799 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4800
4801 uMantissa >>= cShiftOff;
4802 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4803 uMantissa += uRounding;
4804 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4805 {
4806 if (fRoundedOff)
4807 {
4808 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4809 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4810 else if (uRounding)
4811 fFsw |= X86_FSW_C1;
4812 fFsw |= X86_FSW_PE;
4813 if (!(fFcw & X86_FCW_PM))
4814 fFsw |= X86_FSW_ES | X86_FSW_B;
4815 }
4816
4817 pd80Dst->s.fSign = fSignIn;
4818 pd80Dst->s.uPad = 0;
 /* Peel off two decimal digits per iteration, least significant pair first. */
4819 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4820 {
4821 unsigned const uDigits = uMantissa % 100;
4822 uMantissa /= 100;
4823 uint8_t const bLo = uDigits % 10;
4824 uint8_t const bHi = uDigits / 10;
4825 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4826 }
4827 }
4828 else
4829 {
4830 /* overflowed after rounding. */
4831 fFsw |= X86_FSW_IE;
4832 if (fFcw & X86_FCW_IM)
4833 *pd80Dst = s_d80Indefinite;
4834 else
4835 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4836 }
4837 }
4838 /*
4839 * Tiny sub-zero numbers.
4840 */
4841 else if (iExponent < 0)
4842 {
4843 if (!fSignIn)
4844 {
4845 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4846 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4847 {
4848 *pd80Dst = s_ad80One[fSignIn];
4849 fFsw |= X86_FSW_C1;
4850 }
4851 else
4852 *pd80Dst = s_ad80Zeros[fSignIn];
4853 }
4854 else
4855 {
4856 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4857 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4858 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4859 *pd80Dst = s_ad80Zeros[fSignIn];
4860 else
4861 {
4862 *pd80Dst = s_ad80One[fSignIn];
4863 fFsw |= X86_FSW_C1;
4864 }
4865 }
4866 fFsw |= X86_FSW_PE;
4867 if (!(fFcw & X86_FCW_PM))
4868 fFsw |= X86_FSW_ES | X86_FSW_B;
4869 }
4870 /*
4871 * Too large/small number outside the target integer range.
4872 */
4873 else
4874 {
4875 fFsw |= X86_FSW_IE;
4876 if (fFcw & X86_FCW_IM)
4877 *pd80Dst = s_d80Indefinite;
4878 else
4879 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4880 }
4881 }
4882 /*
4883 * Map +0 and -0 to the packed BCD zero entry selected by the input sign
4883 * (unlike the integer stores, the zero table is indexed by fSignIn).
4884 */
4885 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4886 *pd80Dst = s_ad80Zeros[fSignIn];
4887 /*
4888 * Denormals are just really tiny sub-zero numbers that are either rounded
4889 * to zero, 1 or -1 depending on sign and rounding control.
4890 */
4891 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4892 {
4893 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4894 *pd80Dst = s_ad80Zeros[fSignIn];
4895 else
4896 {
4897 *pd80Dst = s_ad80One[fSignIn];
4898 fFsw |= X86_FSW_C1;
4899 }
4900 fFsw |= X86_FSW_PE;
4901 if (!(fFcw & X86_FCW_PM))
4902 fFsw |= X86_FSW_ES | X86_FSW_B;
4903 }
4904 /*
4905 * All other special values are considered invalid arguments and result
4906 * in an IE exception and indefinite value if masked.
4907 */
4908 else
4909 {
4910 fFsw |= X86_FSW_IE;
4911 if (fFcw & X86_FCW_IM)
4912 *pd80Dst = s_d80Indefinite;
4913 else
4914 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4915 }
4916 *pu16FSW = fFsw;
4917}
4918
4919
4920/*********************************************************************************************************************************
4921* FPU Helpers *
4922*********************************************************************************************************************************/
4923AssertCompileSize(RTFLOAT128U, 16);
4924AssertCompileSize(RTFLOAT80U, 10);
4925AssertCompileSize(RTFLOAT64U, 8);
4926AssertCompileSize(RTFLOAT32U, 4);
4927
4928/**
4929 * Normalizes a possible pseudo-denormal value.
4930 *
4931 * Pseudo-denormal values are some oddities from the 8087 & 287 days. They are
4932 * denormals with the J-bit set, so they can simply be rewritten as normal
4932 * values with an exponent of 2**-16382,
4933 * i.e. changing uExponent from 0 to 1.
4934 *
4935 * This macro will declare a RTFLOAT80U with the name given by
4936 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
4937 * a normalization was performed.  @a a_pr80Val must therefore be an
4937 * assignable pointer variable, and the macro expands to a declaration
4937 * followed by a statement, so it has to be used at statement level and be
4937 * terminated with a semicolon.
4938 *
4939 * @note This must be applied before calling SoftFloat with a value that could be
4940 * a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
4941 * correctly.
4942 */
4943#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
4944 RTFLOAT80U a_r80ValNormalized; \
4945 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
4946 { \
4947 a_r80ValNormalized = *a_pr80Val; \
4948 a_r80ValNormalized.s.uExponent = 1; \
4949 a_pr80Val = &a_r80ValNormalized; \
4950 } else do {} while (0)
4951
4952#ifdef IEM_WITH_FLOAT128_FOR_FPU
4953
4954DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4955{
4956 int fNew;
4957 switch (fFcw & X86_FCW_RC_MASK)
4958 {
4959 default:
4960 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4961 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4962 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4963 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4964 }
4965 int fOld = fegetround();
4966 fesetround(fNew);
4967 return fOld;
4968}
4969
4970
4971DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4972{
4973 fesetround(fOld);
4974}
4975
4976DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4977{
4978 RT_NOREF(fFcw);
4979 RTFLOAT128U Tmp;
4980 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4981 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4982 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4983 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4984 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4985 {
4986 Assert(Tmp.s.uExponent == 0);
4987 Tmp.s2.uSignAndExponent++;
4988 }
4989 return *(_Float128 *)&Tmp;
4990}
4991
4992
4993DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
4994{
4995 RT_NOREF(fFcw);
4996 RTFLOAT128U Tmp;
4997 *(_Float128 *)&Tmp = rd128ValSrc;
4998 ASMCompilerBarrier();
4999 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5000 {
5001 pr80Dst->s.fSign = Tmp.s64.fSign;
5002 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5003 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5004 | Tmp.s64.uFractionLo >> (64 - 15);
5005
5006 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5007 unsigned const cShiftOff = 64 - 15;
5008 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5009 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5010 if (uRoundedOff)
5011 {
5012 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5013 ? RT_BIT_64(cShiftOff - 1)
5014 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5015 ? fRoundingOffMask
5016 : 0;
5017 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5018 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5019 || uRoundedOff != uRoundingAdd)
5020 {
5021 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5022 {
5023 uFraction += 1;
5024 if (!(uFraction & RT_BIT_64(63)))
5025 { /* likely */ }
5026 else
5027 {
5028 uFraction >>= 1;
5029 pr80Dst->s.uExponent++;
5030 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5031 return fFsw;
5032 }
5033 fFsw |= X86_FSW_C1;
5034 }
5035 }
5036 fFsw |= X86_FSW_PE;
5037 if (!(fFcw & X86_FCW_PM))
5038 fFsw |= X86_FSW_ES | X86_FSW_B;
5039 }
5040 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5041 }
5042 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5043 {
5044 pr80Dst->s.fSign = Tmp.s64.fSign;
5045 pr80Dst->s.uExponent = 0;
5046 pr80Dst->s.uMantissa = 0;
5047 }
5048 else if (RTFLOAT128U_IS_INF(&Tmp))
5049 {
5050 pr80Dst->s.fSign = Tmp.s64.fSign;
5051 pr80Dst->s.uExponent = 0;
5052 pr80Dst->s.uMantissa = 0;
5053 }
5054 return fFsw;
5055}
5056
5057
5058#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5059
5060/**
 * Initializer for the SoftFloat state structure.
 *
 * Expands to a positional brace initializer with five values derived from the
 * FPU control word @a a_fFcw:
 *      -# softfloat_tininess_afterRounding (constant),
 *      -# the SoftFloat rounding mode matching the RC field (nearest-even,
 *         max, min, or minMag for everything else),
 *      -# zero,
 *      -# the exception mask bits of FCW,
 *      -# 64 for X86_FCW_PC_53, 32 for X86_FCW_PC_24 and 80 otherwise.
 *
 * NOTE(review): presumably these map, in order, to the tininess detection
 * mode, rounding mode, accumulated exception flags, exception mask and
 * rounding precision members of softfloat_state_t - confirm against its
 * declaration, as the initializer depends on the member order.
 */
5061# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
5062 { \
5063 softfloat_tininess_afterRounding, \
5064 ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
5065 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP ? (uint8_t)softfloat_round_max \
5066 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN ? (uint8_t)softfloat_round_min \
5067 : (uint8_t)softfloat_round_minMag, \
5068 0, \
5069 (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
5070 ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
5071 : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
5072 }
5073
5074/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The result is @a a_fFsw with the following ORed in:
 *      - the softfloat_flag_c1 bit of the SoftFloat exception flags shifted
 *        left by two (NOTE(review): presumably landing on X86_FSW_C1 -
 *        confirm against the flag definition),
 *      - the SoftFloat exception flags covered by X86_FSW_XCPT_MASK,
 *      - X86_FSW_ES and X86_FSW_B if any of those exceptions has its bit
 *        clear in @a a_fFcw, i.e. is unmasked.
 *
 * @note Each argument is evaluated more than once.
 */
5075# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
5076 ( (a_fFsw) \
5077 | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
5078 | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
5079 | ( ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
5080 ? X86_FSW_ES | X86_FSW_B : 0) )
5081
5082
5083DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5084{
5085 RT_NOREF(fFcw);
5086 Assert(cBits > 64);
5087# if 0 /* rounding does not seem to help */
5088 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5089 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5090 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5091 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5092 {
5093 uint64_t uOld = r128.v[0];
5094 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5095 if (r128.v[0] < uOld)
5096 r128.v[1] += 1;
5097 }
5098# else
5099 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5100# endif
5101 return r128;
5102}
5103
5104
5105DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5106{
5107 RT_NOREF(fFcw);
5108 Assert(cBits > 64);
5109# if 0 /* rounding does not seem to help, not even on constants */
5110 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5111 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5112 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5113 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5114 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5115 {
5116 uint64_t uOld = r128.v[0];
5117 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5118 if (r128.v[0] < uOld)
5119 r128.v[1] += 1;
5120 }
5121 return r128;
5122# else
5123 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5124 return r128;
5125# endif
5126}
5127
5128
# if 0 /* unused */
/**
 * Converts an IPRT 128-bit floating point value to the SoftFloat type by
 * copying the two 64-bit words.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128Ret;
    r128Ret.v[0] = pr128->au64[0];
    r128Ret.v[1] = pr128->au64[1];
    return r128Ret;
}
# endif
5136
5137
5138/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5139DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5140{
5141 extFloat80_t Tmp;
5142 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5143 Tmp.signif = pr80Val->s2.uMantissa;
5144 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5145 return extF80_to_f128(Tmp, &Ignored);
5146}
5147
5148
5149/**
5150 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5151 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5152 *
5153 * This is only a structure format conversion, nothing else.
5154 */
5155DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5156{
5157 extFloat80_t Tmp;
5158 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5159 Tmp.signif = pr80Val->s2.uMantissa;
5160 return Tmp;
5161}
5162
5163
5164/**
5165 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5166 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5167 *
5168 * This is only a structure format conversion, nothing else.
5169 */
5170DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5171{
5172 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5173 pr80Dst->s2.uMantissa = r80XSrc.signif;
5174 return pr80Dst;
5175}
5176
5177
5178DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5179{
5180 RT_NOREF(fFcw);
5181 RTFLOAT128U Tmp;
5182 *(float128_t *)&Tmp = r128Src;
5183 ASMCompilerBarrier();
5184
5185 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5186 {
5187 pr80Dst->s.fSign = Tmp.s64.fSign;
5188 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5189 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5190 | Tmp.s64.uFractionLo >> (64 - 15);
5191
5192 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5193 unsigned const cShiftOff = 64 - 15;
5194 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5195 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5196 if (uRoundedOff)
5197 {
5198 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5199 ? RT_BIT_64(cShiftOff - 1)
5200 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5201 ? fRoundingOffMask
5202 : 0;
5203 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5204 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5205 || uRoundedOff != uRoundingAdd)
5206 {
5207 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5208 {
5209 uFraction += 1;
5210 if (!(uFraction & RT_BIT_64(63)))
5211 { /* likely */ }
5212 else
5213 {
5214 uFraction >>= 1;
5215 pr80Dst->s.uExponent++;
5216 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5217 return fFsw;
5218 }
5219 fFsw |= X86_FSW_C1;
5220 }
5221 }
5222 fFsw |= X86_FSW_PE;
5223 if (!(fFcw & X86_FCW_PM))
5224 fFsw |= X86_FSW_ES | X86_FSW_B;
5225 }
5226
5227 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5228 }
5229 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5230 {
5231 pr80Dst->s.fSign = Tmp.s64.fSign;
5232 pr80Dst->s.uExponent = 0;
5233 pr80Dst->s.uMantissa = 0;
5234 }
5235 else if (RTFLOAT128U_IS_INF(&Tmp))
5236 {
5237 pr80Dst->s.fSign = Tmp.s64.fSign;
5238 pr80Dst->s.uExponent = 0;
5239 pr80Dst->s.uMantissa = 0;
5240 }
5241 return fFsw;
5242}
5243
5244
5245/**
5246 * Helper for transfering exception and C1 to FSW and setting the result value
5247 * accordingly.
5248 *
5249 * @returns Updated FSW.
5250 * @param pSoftState The SoftFloat state following the operation.
5251 * @param r80XResult The result of the SoftFloat operation.
5252 * @param pr80Result Where to store the result for IEM.
5253 * @param fFcw The FPU control word.
5254 * @param fFsw The FSW before the operation, with necessary bits
5255 * cleared and such.
5256 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5257 * raised.
5258 */
5259DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5260 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5261 PCRTFLOAT80U pr80XcptResult)
5262{
5263 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5264 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5265 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5266 fFsw |= X86_FSW_ES | X86_FSW_B;
5267
5268 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5269 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5270 else
5271 {
5272 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5273 *pr80Result = *pr80XcptResult;
5274 }
5275 return fFsw;
5276}
5277
5278
5279/**
5280 * Helper doing polynomial evaluation using Horner's method.
5281 *
5282 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5283 */
5284float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5285 unsigned cPrecision, softfloat_state_t *pSoftState)
5286{
5287 Assert(cHornerConsts > 1);
5288 size_t i = cHornerConsts - 1;
5289 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5290 while (i-- > 0)
5291 {
5292 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5293 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5294 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5295 }
5296 return r128Result;
5297}
5298
5299#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5300
5301
/**
 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
 * mantissa, exponent and sign.
 *
 * The top 64 bits of the 192-bit mantissa (qw2) become the result mantissa,
 * the lower 128 bits (qw1:qw0) only decide the rounding.  Rounding sets PE
 * (plus C1 when rounding up), a zero or negative biased exponent sets UE.
 * Overflow is not handled, only asserted on.
 *
 * @returns Updated FSW.
 * @param   pr80Dst     Where to return the composed value.
 * @param   fSign       The sign.
 * @param   puMantissa  The mantissa, 256-bit type but the top 64 bits are
 *                      ignored and should be zero.  This will probably be
 *                      modified during normalization and rounding.
 * @param   iExponent   Unbiased exponent.
 * @param   fFcw        The FPU control word.
 * @param   fFsw        The FPU status word.
 */
static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
                                                    int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
{
    /* The top quadword is expected to be zero; it is cleared should the assertion fail. */
    AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);

    /* Work with the biased exponent from here on. */
    iExponent += RTFLOAT80U_EXP_BIAS;

    /* Do normalization if necessary and possible. */
    if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
    {
        /* Shift needed to bring the top set bit up to bit 191 (presumably
           RTUInt256BitCount returns the number of significant bits - confirm). */
        int cShift = 192 - RTUInt256BitCount(puMantissa);
        if (iExponent > cShift)
            iExponent -= cShift;
        else
        {
            /* Not enough exponent for a full normalization.  With underflow
               masked, shift only as far as the exponent allows so that it
               ends up at zero; otherwise do the full shift and let the
               exponent go to zero or below (dealt with further down). */
            if (fFcw & X86_FCW_UM)
            {
                if (iExponent > 0)
                    cShift = --iExponent;
                else
                    cShift = 0;
            }
            iExponent -= cShift;
        }
        RTUInt256AssignShiftLeft(puMantissa, cShift);
    }

    /* Do rounding. */
    uint64_t uMantissa = puMantissa->QWords.qw2;
    if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
    {
        bool fAdd;
        switch (fFcw & X86_FCW_RC_MASK)
        {
            default: /* (for the simple-minded MSC which otherwise thinks fAdd would be used uninitialized) */
            case X86_FCW_RC_NEAREST:
                /* Bit 63 of qw1 set means the discarded part is at least half a unit. */
                if (puMantissa->QWords.qw1 & RT_BIT_64(63))
                {
                    /* Round up if more than half is discarded, or on an exact tie with an odd mantissa. */
                    if (   (uMantissa & 1)
                        || puMantissa->QWords.qw0 != 0
                        || puMantissa->QWords.qw1 != RT_BIT_64(63))
                    {
                        fAdd = true;
                        break;
                    }
                    /* Exact tie with an even mantissa: stay put (bit 0 is already clear here). */
                    uMantissa &= ~(uint64_t)1;
                }
                fAdd = false;
                break;
            case X86_FCW_RC_ZERO:
                fAdd = false;
                break;
            case X86_FCW_RC_UP:
                fAdd = !fSign;
                break;
            case X86_FCW_RC_DOWN:
                fAdd = fSign;
                break;
        }
        if (fAdd)
        {
            uint64_t const uTmp = uMantissa;
            uMantissa = uTmp + 1;
            /* The increment wrapped around to zero: the mantissa becomes just
               the top bit and the exponent goes up by one. */
            if (uMantissa < uTmp)
            {
                uMantissa >>= 1;
                uMantissa |= RT_BIT_64(63);
                iExponent++;
            }
            fFsw |= X86_FSW_C1;
        }
        /* Bits were discarded, so the result is inexact. */
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    /* Check for underflow (denormals). */
    if (iExponent <= 0)
    {
        if (fFcw & X86_FCW_UM)
        {
            /* Masked: exponent becomes zero; a set top bit is shifted one position down. */
            if (uMantissa & RT_BIT_64(63))
                uMantissa >>= 1;
            iExponent = 0;
        }
        else
        {
            /* Unmasked: apply the exponent bias adjustment and flag the pending exception. */
            iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_UE;
    }
    /* Check for overflow */
    else if (iExponent >= RTFLOAT80U_EXP_MAX)
    {
        /* NOTE(review): overflow is not handled; this assertion cannot hold
           inside this branch, so it only flags the case in strict builds. */
        Assert(iExponent < RTFLOAT80U_EXP_MAX);
    }

    /* Compose the result. */
    pr80Dst->s.uMantissa = uMantissa;
    pr80Dst->s.uExponent = iExponent;
    pr80Dst->s.fSign     = fSign;
    return fFsw;
}
5420
5421
5422/**
5423 * See also iemAImpl_fld_r80_from_r32
5424 */
5425static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5426{
5427 uint16_t fFsw = 0;
5428 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5429 {
5430 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5431 pr80Dst->sj64.fInteger = 1;
5432 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5433 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5434 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5435 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5436 }
5437 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5438 {
5439 pr80Dst->s.fSign = pr32Val->s.fSign;
5440 pr80Dst->s.uExponent = 0;
5441 pr80Dst->s.uMantissa = 0;
5442 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5443 }
5444 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5445 {
5446 /* Subnormal -> normalized + X86_FSW_DE return. */
5447 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5448 pr80Dst->sj64.fInteger = 1;
5449 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5450 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5451 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5452 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5453 fFsw = X86_FSW_DE;
5454 }
5455 else if (RTFLOAT32U_IS_INF(pr32Val))
5456 {
5457 pr80Dst->s.fSign = pr32Val->s.fSign;
5458 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5459 pr80Dst->s.uMantissa = RT_BIT_64(63);
5460 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5461 }
5462 else
5463 {
5464 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5465 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5466 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5467 pr80Dst->sj64.fInteger = 1;
5468 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5469 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5470 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5471 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5472 }
5473 return fFsw;
5474}
5475
5476
5477/**
5478 * See also iemAImpl_fld_r80_from_r64
5479 */
5480static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5481{
5482 uint16_t fFsw = 0;
5483 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5484 {
5485 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5486 pr80Dst->sj64.fInteger = 1;
5487 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5488 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5489 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5490 }
5491 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5492 {
5493 pr80Dst->s.fSign = pr64Val->s.fSign;
5494 pr80Dst->s.uExponent = 0;
5495 pr80Dst->s.uMantissa = 0;
5496 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5497 }
5498 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5499 {
5500 /* Subnormal values gets normalized. */
5501 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5502 pr80Dst->sj64.fInteger = 1;
5503 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5504 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5505 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5506 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5507 fFsw = X86_FSW_DE;
5508 }
5509 else if (RTFLOAT64U_IS_INF(pr64Val))
5510 {
5511 pr80Dst->s.fSign = pr64Val->s.fSign;
5512 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5513 pr80Dst->s.uMantissa = RT_BIT_64(63);
5514 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5515 }
5516 else
5517 {
5518 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5519 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5520 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5521 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5522 pr80Dst->sj64.fInteger = 1;
5523 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5524 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5525 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5526 }
5527 return fFsw;
5528}
5529
5530
5531/**
5532 * See also EMIT_FILD.
5533 */
5534#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5535static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5536{ \
5537 if (iVal == 0) \
5538 { \
5539 pr80Dst->s.fSign = 0; \
5540 pr80Dst->s.uExponent = 0; \
5541 pr80Dst->s.uMantissa = 0; \
5542 } \
5543 else \
5544 { \
5545 if (iVal > 0) \
5546 pr80Dst->s.fSign = 0; \
5547 else \
5548 { \
5549 pr80Dst->s.fSign = 1; \
5550 iVal = -iVal; \
5551 } \
5552 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5553 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5554 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5555 } \
5556 return pr80Dst; \
5557}
5558EMIT_CONVERT_IXX_TO_R80(16)
5559EMIT_CONVERT_IXX_TO_R80(32)
5560//EMIT_CONVERT_IXX_TO_R80(64)
5561
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such on top of the
 * corresponding r80-by-r80 function.
 *
 * The 64-bit operand is first widened by iemAImplConvertR64ToR80.  If that
 * reports a denormal operand (X86_FSW_DE), the flag is dropped when the first
 * operand is a 387-invalid value or a NaN, or when @a a_DenormalException is
 * true.  Otherwise, if X86_FCW_DM is clear, the first operand is returned
 * unchanged with DE, ES and B set and the r80-by-r80 function is not called.
 * In the remaining cases the r80-by-r80 function runs, TOP in its FSW is
 * replaced by 7 and any surviving DE flag is OR'ed in.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The r80-by-r80 function doing the work.
 * @param   a_DenormalException     Additional condition (may use pr80Val1)
 *                                  under which the denormal flag is dropped.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        bool const fDropDenormalFlag =    RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                                       || RTFLOAT80U_IS_NAN(pr80Val1) \
                                       || (a_DenormalException); \
        if (fDropDenormalFlag) \
            fFsw = 0; \
        else if ((pFpuState->FCW & X86_FCW_DM) == 0) \
        { \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5584
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such on top of the
 * corresponding r80-by-r80 function.
 *
 * The 32-bit operand is first widened by iemAImplConvertR32ToR80.  If that
 * reports a denormal operand (X86_FSW_DE), the flag is dropped when the first
 * operand is a 387-invalid value or a NaN, or when @a a_DenormalException is
 * true.  Otherwise, if X86_FCW_DM is clear, the first operand is returned
 * unchanged with DE, ES and B set and the r80-by-r80 function is not called.
 * In the remaining cases the r80-by-r80 function runs, TOP in its FSW is
 * replaced by 7 and any surviving DE flag is OR'ed in.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The r80-by-r80 function doing the work.
 * @param   a_DenormalException     Additional condition (may use pr80Val1)
 *                                  under which the denormal flag is dropped.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        bool const fDropDenormalFlag =    RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                                       || RTFLOAT80U_IS_NAN(pr80Val1) \
                                       || (a_DenormalException); \
        if (fDropDenormalFlag) \
            fFsw = 0; \
        else if ((pFpuState->FCW & X86_FCW_DM) == 0) \
        { \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5607
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such: converts the 32-bit
 * integer operand to 80-bit floating point, calls the r80-by-r80 function and
 * then replaces TOP in the resulting FSW with 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The r80-by-r80 function doing the work.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5616
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such: converts the 16-bit
 * integer operand to 80-bit floating point, calls the r80-by-r80 function and
 * then replaces TOP in the resulting FSW with 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The r80-by-r80 function doing the work.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5625
5626
5627
5628/*********************************************************************************************************************************
*   x87 FPU Division Operations                                                                                                  *
5630*********************************************************************************************************************************/
5631
5632/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5633static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5634 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5635{
5636 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5637 {
5638 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5639 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5640 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5641 }
5642 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5643 { /* Div by zero. */
5644 if (fFcw & X86_FCW_ZM)
5645 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5646 else
5647 {
5648 *pr80Result = *pr80Val1Org;
5649 fFsw |= X86_FSW_ES | X86_FSW_B;
5650 }
5651 fFsw |= X86_FSW_ZE;
5652 }
5653 else
5654 { /* Invalid operand */
5655 if (fFcw & X86_FCW_IM)
5656 *pr80Result = g_r80Indefinite;
5657 else
5658 {
5659 *pr80Result = *pr80Val1Org;
5660 fFsw |= X86_FSW_ES | X86_FSW_B;
5661 }
5662 fFsw |= X86_FSW_IE;
5663 }
5664 return fFsw;
5665}
5666
5667
5668IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5669 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5670{
5671 uint16_t const fFcw = pFpuState->FCW;
5672 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5673
5674 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5675 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5676 {
5677 if (fFcw & X86_FCW_IM)
5678 pFpuRes->r80Result = g_r80Indefinite;
5679 else
5680 {
5681 pFpuRes->r80Result = *pr80Val1;
5682 fFsw |= X86_FSW_ES | X86_FSW_B;
5683 }
5684 fFsw |= X86_FSW_IE;
5685 }
5686 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5687 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5688 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5689 {
5690 if (fFcw & X86_FCW_DM)
5691 {
5692 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5693 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5694 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5695 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5696 }
5697 else
5698 {
5699 pFpuRes->r80Result = *pr80Val1;
5700 fFsw |= X86_FSW_ES | X86_FSW_B;
5701 }
5702 fFsw |= X86_FSW_DE;
5703 }
5704 /* SoftFloat can handle the rest: */
5705 else
5706 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5707
5708 pFpuRes->FSW = fFsw;
5709}
5710
5711
5712EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5713EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5714EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5715EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5716
5717
5718IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5719 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5720{
5721 uint16_t const fFcw = pFpuState->FCW;
5722 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5723
5724 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5725 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5726 {
5727 if (fFcw & X86_FCW_IM)
5728 pFpuRes->r80Result = g_r80Indefinite;
5729 else
5730 {
5731 pFpuRes->r80Result = *pr80Val1;
5732 fFsw |= X86_FSW_ES | X86_FSW_B;
5733 }
5734 fFsw |= X86_FSW_IE;
5735 }
5736 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5737 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5738 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5739 {
5740 if (fFcw & X86_FCW_DM)
5741 {
5742 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5743 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5744 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5745 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5746 }
5747 else
5748 {
5749 pFpuRes->r80Result = *pr80Val1;
5750 fFsw |= X86_FSW_ES | X86_FSW_B;
5751 }
5752 fFsw |= X86_FSW_DE;
5753 }
5754 /* SoftFloat can handle the rest: */
5755 else
5756 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5757
5758 pFpuRes->FSW = fFsw;
5759}
5760
5761
5762EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5763EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5764EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5765EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5766
5767
5768/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5769static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5770 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5771{
5772 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5773 {
5774 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5775 uint16_t fCxFlags = 0;
5776 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5777 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5778 &fCxFlags, &SoftState);
5779 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5780 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5781 if ( !(fFsw & X86_FSW_IE)
5782 && !RTFLOAT80U_IS_NAN(pr80Result)
5783 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5784 {
5785 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5786 fFsw |= fCxFlags & X86_FSW_C_MASK;
5787 }
5788 return fFsw;
5789 }
5790
5791 /* Invalid operand */
5792 if (fFcw & X86_FCW_IM)
5793 *pr80Result = g_r80Indefinite;
5794 else
5795 {
5796 *pr80Result = *pr80Val1Org;
5797 fFsw |= X86_FSW_ES | X86_FSW_B;
5798 }
5799 return fFsw | X86_FSW_IE;
5800}
5801
5802
5803static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5804 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5805{
5806 uint16_t const fFcw = pFpuState->FCW;
5807 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5808
5809 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5810 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5811 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5812 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5813 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5814 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5815 {
5816 if (fFcw & X86_FCW_IM)
5817 pFpuRes->r80Result = g_r80Indefinite;
5818 else
5819 {
5820 pFpuRes->r80Result = *pr80Val1;
5821 fFsw |= X86_FSW_ES | X86_FSW_B;
5822 }
5823 fFsw |= X86_FSW_IE;
5824 }
5825 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5826 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5827 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5828 {
5829 if (fFcw & X86_FCW_DM)
5830 {
5831 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5832 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5833 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5834 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5835 pr80Val1Org, fLegacyInstr);
5836 }
5837 else
5838 {
5839 pFpuRes->r80Result = *pr80Val1;
5840 fFsw |= X86_FSW_ES | X86_FSW_B;
5841 }
5842 fFsw |= X86_FSW_DE;
5843 }
5844 /* SoftFloat can handle the rest: */
5845 else
5846 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5847 pr80Val1, fLegacyInstr);
5848
5849 pFpuRes->FSW = fFsw;
5850}
5851
5852
5853IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5854 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5855{
5856 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5857}
5858
5859
5860IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5861 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5862{
5863 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5864}
5865
5866
5867/*********************************************************************************************************************************
5868* x87 FPU Multiplication Operations *
5869*********************************************************************************************************************************/
5870
5871/** Worker for iemAImpl_fmul_r80_by_r80. */
5872static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5873 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5874{
5875 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5876 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5877 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5878}
5879
5880
5881IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5882 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5883{
5884 uint16_t const fFcw = pFpuState->FCW;
5885 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5886
5887 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5888 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5889 {
5890 if (fFcw & X86_FCW_IM)
5891 pFpuRes->r80Result = g_r80Indefinite;
5892 else
5893 {
5894 pFpuRes->r80Result = *pr80Val1;
5895 fFsw |= X86_FSW_ES | X86_FSW_B;
5896 }
5897 fFsw |= X86_FSW_IE;
5898 }
5899 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5900 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5901 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5902 {
5903 if (fFcw & X86_FCW_DM)
5904 {
5905 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5906 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5907 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5908 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5909 }
5910 else
5911 {
5912 pFpuRes->r80Result = *pr80Val1;
5913 fFsw |= X86_FSW_ES | X86_FSW_B;
5914 }
5915 fFsw |= X86_FSW_DE;
5916 }
5917 /* SoftFloat can handle the rest: */
5918 else
5919 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5920
5921 pFpuRes->FSW = fFsw;
5922}
5923
5924
5925EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5926EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5927EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5928EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5929
5930
5931/*********************************************************************************************************************************
5932* x87 FPU Addition *
5933*********************************************************************************************************************************/
5934
5935/** Worker for iemAImpl_fadd_r80_by_r80. */
5936static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5937 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5938{
5939 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5940 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5941 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5942}
5943
5944
5945IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5946 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5947{
5948 uint16_t const fFcw = pFpuState->FCW;
5949 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5950
5951 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5952 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5953 {
5954 if (fFcw & X86_FCW_IM)
5955 pFpuRes->r80Result = g_r80Indefinite;
5956 else
5957 {
5958 pFpuRes->r80Result = *pr80Val1;
5959 fFsw |= X86_FSW_ES | X86_FSW_B;
5960 }
5961 fFsw |= X86_FSW_IE;
5962 }
5963 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5964 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5965 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5966 {
5967 if (fFcw & X86_FCW_DM)
5968 {
5969 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5970 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5971 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5972 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5973 }
5974 else
5975 {
5976 pFpuRes->r80Result = *pr80Val1;
5977 fFsw |= X86_FSW_ES | X86_FSW_B;
5978 }
5979 fFsw |= X86_FSW_DE;
5980 }
5981 /* SoftFloat can handle the rest: */
5982 else
5983 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5984
5985 pFpuRes->FSW = fFsw;
5986}
5987
5988
5989EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
5990EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
5991EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
5992EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
5993
5994
5995/*********************************************************************************************************************************
5996* x87 FPU Subtraction *
5997*********************************************************************************************************************************/
5998
5999/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6000static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6001 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6002{
6003 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6004 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6005 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6006}
6007
6008
6009IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6010 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6011{
6012 uint16_t const fFcw = pFpuState->FCW;
6013 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6014
6015 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6016 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6017 {
6018 if (fFcw & X86_FCW_IM)
6019 pFpuRes->r80Result = g_r80Indefinite;
6020 else
6021 {
6022 pFpuRes->r80Result = *pr80Val1;
6023 fFsw |= X86_FSW_ES | X86_FSW_B;
6024 }
6025 fFsw |= X86_FSW_IE;
6026 }
6027 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6028 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6029 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6030 {
6031 if (fFcw & X86_FCW_DM)
6032 {
6033 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6034 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6035 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6036 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6037 }
6038 else
6039 {
6040 pFpuRes->r80Result = *pr80Val1;
6041 fFsw |= X86_FSW_ES | X86_FSW_B;
6042 }
6043 fFsw |= X86_FSW_DE;
6044 }
6045 /* SoftFloat can handle the rest: */
6046 else
6047 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6048
6049 pFpuRes->FSW = fFsw;
6050}
6051
6052
6053EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6054EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6055EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6056EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6057
6058
6059/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6060IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6061 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6062{
6063 uint16_t const fFcw = pFpuState->FCW;
6064 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6065
6066 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6067 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6068 {
6069 if (fFcw & X86_FCW_IM)
6070 pFpuRes->r80Result = g_r80Indefinite;
6071 else
6072 {
6073 pFpuRes->r80Result = *pr80Val1;
6074 fFsw |= X86_FSW_ES | X86_FSW_B;
6075 }
6076 fFsw |= X86_FSW_IE;
6077 }
6078 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6079 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6080 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6081 {
6082 if (fFcw & X86_FCW_DM)
6083 {
6084 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6085 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6086 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6087 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6088 }
6089 else
6090 {
6091 pFpuRes->r80Result = *pr80Val1;
6092 fFsw |= X86_FSW_ES | X86_FSW_B;
6093 }
6094 fFsw |= X86_FSW_DE;
6095 }
6096 /* SoftFloat can handle the rest: */
6097 else
6098 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6099
6100 pFpuRes->FSW = fFsw;
6101}
6102
6103
/* Instantiates the fsubr variants taking an r64, r32, i32 or i16 second operand.
   The EMIT_R80_BY_* macros are defined outside this excerpt; presumably each one
   emits a wrapper that widens the operand to 80-bit and then calls the named
   r80-by-r80 function (iemAImpl_fsubr_r80_by_r80) - TODO confirm against the
   macro definitions, including the meaning of the trailing 0 argument. */
6104EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6105EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6106EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6107EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6108
6109
6110/*********************************************************************************************************************************
6111* x87 FPU Trigonometric Operations *
6112*********************************************************************************************************************************/
6113
6114
6115IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6116 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6117{
6118 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6119 AssertReleaseFailed();
6120}
6121
6122#endif /* IEM_WITHOUT_ASSEMBLY */
6123
6124IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6125 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6126{
6127 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6128}
6129
6130IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6131 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6132{
6133 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6134}
6135
6136
6137#if defined(IEM_WITHOUT_ASSEMBLY)
6138IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6139{
6140 RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
6141 AssertReleaseFailed();
6142}
6143#endif /* IEM_WITHOUT_ASSEMBLY */
6144
6145IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6146{
6147 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6148}
6149
6150IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6151{
6152 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6153}
6154
6155
6156#ifdef IEM_WITHOUT_ASSEMBLY
6157IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6158{
6159 RT_NOREF(pFpuState, pFpuRes, pr80Val);
6160 AssertReleaseFailed();
6161}
6162#endif /* IEM_WITHOUT_ASSEMBLY */
6163
6164IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6165{
6166 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6167}
6168
6169IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6170{
6171 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6172}
6173
6174#ifdef IEM_WITHOUT_ASSEMBLY
6175IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6176{
6177 RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
6178 AssertReleaseFailed();
6179}
6180#endif /* IEM_WITHOUT_ASSEMBLY */
6181
6182IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6183{
6184 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6185}
6186
6187IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6188{
6189 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6190}
6191
6192
6193#ifdef IEM_WITHOUT_ASSEMBLY
6194IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6195{
6196 RT_NOREF(pFpuState, pFpuRes, pr80Val);
6197 AssertReleaseFailed();
6198}
6199#endif /* IEM_WITHOUT_ASSEMBLY */
6200
6201IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6202{
6203 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6204}
6205
6206IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6207{
6208 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6209}
6210
6211#ifdef IEM_WITHOUT_ASSEMBLY
6212
6213
6214/*********************************************************************************************************************************
6215* x87 FPU Compare and Testing Operations *
6216*********************************************************************************************************************************/
6217
6218IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6219{
6220 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6221
6222 if (RTFLOAT80U_IS_ZERO(pr80Val))
6223 fFsw |= X86_FSW_C3;
6224 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6225 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6226 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6227 {
6228 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6229 if (!(pFpuState->FCW & X86_FCW_DM))
6230 fFsw |= X86_FSW_ES | X86_FSW_B;
6231 }
6232 else
6233 {
6234 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6235 if (!(pFpuState->FCW & X86_FCW_IM))
6236 fFsw |= X86_FSW_ES | X86_FSW_B;
6237 }
6238
6239 *pu16Fsw = fFsw;
6240}
6241
6242
6243IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6244{
6245 RT_NOREF(pFpuState);
6246 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6247
6248 /* C1 = sign bit (always, even if empty Intel says). */
6249 if (pr80Val->s.fSign)
6250 fFsw |= X86_FSW_C1;
6251
6252 /* Classify the value in C0, C2, C3. */
6253 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6254 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6255 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6256 fFsw |= X86_FSW_C2;
6257 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6258 fFsw |= X86_FSW_C3;
6259 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6260 fFsw |= X86_FSW_C0;
6261 else if (RTFLOAT80U_IS_INF(pr80Val))
6262 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6263 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6264 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6265 /* whatever else: 0 */
6266
6267 *pu16Fsw = fFsw;
6268}
6269
6270
6271/**
6272 * Worker for fcom, fucom, and friends.
6273 */
6274static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6275 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6276{
6277 /*
6278 * Unpack the values.
6279 */
6280 bool const fSign1 = pr80Val1->s.fSign;
6281 int32_t iExponent1 = pr80Val1->s.uExponent;
6282 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6283
6284 bool const fSign2 = pr80Val2->s.fSign;
6285 int32_t iExponent2 = pr80Val2->s.uExponent;
6286 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6287
6288 /*
6289 * Check for invalid inputs.
6290 */
6291 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6292 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6293 {
6294 if (!(fFcw & X86_FCW_IM))
6295 fFsw |= X86_FSW_ES | X86_FSW_B;
6296 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6297 }
6298
6299 /*
6300 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6301 */
6302 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6303 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6304 {
6305 if ( fIeOnAllNaNs
6306 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6307 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6308 {
6309 fFsw |= X86_FSW_IE;
6310 if (!(fFcw & X86_FCW_IM))
6311 fFsw |= X86_FSW_ES | X86_FSW_B;
6312 }
6313 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6314 }
6315
6316 /*
6317 * Normalize the values.
6318 */
6319 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6320 {
6321 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6322 iExponent1 = 1;
6323 else
6324 {
6325 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6326 uMantissa1 <<= iExponent1;
6327 iExponent1 = 1 - iExponent1;
6328 }
6329 fFsw |= X86_FSW_DE;
6330 if (!(fFcw & X86_FCW_DM))
6331 fFsw |= X86_FSW_ES | X86_FSW_B;
6332 }
6333
6334 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6335 {
6336 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6337 iExponent2 = 1;
6338 else
6339 {
6340 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6341 uMantissa2 <<= iExponent2;
6342 iExponent2 = 1 - iExponent2;
6343 }
6344 fFsw |= X86_FSW_DE;
6345 if (!(fFcw & X86_FCW_DM))
6346 fFsw |= X86_FSW_ES | X86_FSW_B;
6347 }
6348
6349 /*
6350 * Test if equal (val1 == val2):
6351 */
6352 if ( uMantissa1 == uMantissa2
6353 && iExponent1 == iExponent2
6354 && ( fSign1 == fSign2
6355 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6356 fFsw |= X86_FSW_C3;
6357 /*
6358 * Test if less than (val1 < val2):
6359 */
6360 else if (fSign1 && !fSign2)
6361 fFsw |= X86_FSW_C0;
6362 else if (fSign1 == fSign2)
6363 {
6364 /* Zeros are problematic, however at the most one can be zero here. */
6365 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6366 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6367 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6368 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6369
6370 if ( fSign1
6371 ^ ( iExponent1 < iExponent2
6372 || ( iExponent1 == iExponent2
6373 && uMantissa1 < uMantissa2 ) ) )
6374 fFsw |= X86_FSW_C0;
6375 }
6376 /* else: No flags set if greater. */
6377
6378 return fFsw;
6379}
6380
6381
6382IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6383 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6384{
6385 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6386}
6387
6388
6389
6390
6391IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6392 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6393{
6394 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6395}
6396
6397
6398IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6399 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6400{
6401 RTFLOAT80U r80Val2;
6402 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6403 Assert(!fFsw || fFsw == X86_FSW_DE);
6404 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6405 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6406 {
6407 if (!(pFpuState->FCW & X86_FCW_DM))
6408 fFsw |= X86_FSW_ES | X86_FSW_B;
6409 *pfFsw |= fFsw;
6410 }
6411}
6412
6413
6414IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6415 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6416{
6417 RTFLOAT80U r80Val2;
6418 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6419 Assert(!fFsw || fFsw == X86_FSW_DE);
6420 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6421 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6422 {
6423 if (!(pFpuState->FCW & X86_FCW_DM))
6424 fFsw |= X86_FSW_ES | X86_FSW_B;
6425 *pfFsw |= fFsw;
6426 }
6427}
6428
6429
6430IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6431 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6432{
6433 RTFLOAT80U r80Val2;
6434 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6435 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6436}
6437
6438
6439IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6440 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6441{
6442 RTFLOAT80U r80Val2;
6443 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6444 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6445}
6446
6447
6448/**
6449 * Worker for fcomi & fucomi.
6450 */
6451static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6452 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6453{
6454 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6455 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6456 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6457 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6458
6459 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6460 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6461 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6462}
6463
6464
6465IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6466 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6467{
6468 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
6469}
6470
6471
6472IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6473 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6474{
6475 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
6476}
6477
6478
6479/*********************************************************************************************************************************
6480* x87 FPU Other Operations *
6481*********************************************************************************************************************************/
6482
6483/**
6484 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
6485 */
6486static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6487{
6488 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6489 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
6490 true /*exact / generate #PE */, &SoftState));
6491 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6492}
6493
6494
6495IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6496{
6497 uint16_t const fFcw = pFpuState->FCW;
6498 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6499
6500 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6501 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6502 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6503 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6504 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6505 || RTFLOAT80U_IS_INF(pr80Val))
6506 pFpuRes->r80Result = *pr80Val;
6507 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6508 {
6509 fFsw |= X86_FSW_DE;
6510 if (fFcw & X86_FCW_DM)
6511 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6512 else
6513 {
6514 pFpuRes->r80Result = *pr80Val;
6515 fFsw |= X86_FSW_ES | X86_FSW_B;
6516 }
6517 }
6518 else
6519 {
6520 if (fFcw & X86_FCW_IM)
6521 {
6522 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6523 pFpuRes->r80Result = g_r80Indefinite;
6524 else
6525 {
6526 pFpuRes->r80Result = *pr80Val;
6527 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6528 }
6529 }
6530 else
6531 {
6532 pFpuRes->r80Result = *pr80Val;
6533 fFsw |= X86_FSW_ES | X86_FSW_B;
6534 }
6535 fFsw |= X86_FSW_IE;
6536 }
6537 pFpuRes->FSW = fFsw;
6538}
6539
6540
6541IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6542 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6543{
6544 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
6545 it does everything we need it to do. */
6546 uint16_t const fFcw = pFpuState->FCW;
6547 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6548 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6549 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6550 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6551}
6552
6553
6554/**
6555 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
6556 */
6557static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6558{
6559 Assert(!pr80Val->s.fSign);
6560 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6561 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
6562 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6563}
6564
6565
6566IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6567{
6568 uint16_t const fFcw = pFpuState->FCW;
6569 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6570
6571 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
6572 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6573 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6574 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6575 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6576 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
6577 pFpuRes->r80Result = *pr80Val;
6578 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
6579 {
6580 fFsw |= X86_FSW_DE;
6581 if (fFcw & X86_FCW_DM)
6582 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6583 else
6584 {
6585 pFpuRes->r80Result = *pr80Val;
6586 fFsw |= X86_FSW_ES | X86_FSW_B;
6587 }
6588 }
6589 else
6590 {
6591 if (fFcw & X86_FCW_IM)
6592 {
6593 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6594 pFpuRes->r80Result = g_r80Indefinite;
6595 else
6596 {
6597 pFpuRes->r80Result = *pr80Val;
6598 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6599 }
6600 }
6601 else
6602 {
6603 pFpuRes->r80Result = *pr80Val;
6604 fFsw |= X86_FSW_ES | X86_FSW_B;
6605 }
6606 fFsw |= X86_FSW_IE;
6607 }
6608 pFpuRes->FSW = fFsw;
6609}
6610
6611
6612/**
6613 * @code{.unparsed}
6614 * x x * ln2
6615 * f(x) = 2 - 1 = e - 1
6616 *
6617 * @endcode
6618 *
6619 * We can approximate e^x by a Taylor/Maclaurin series (see
6620 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
6621 * @code{.unparsed}
6622 * n 0 1 2 3 4
6623 * inf x x x x x x
6624 * SUM ----- = --- + --- + --- + --- + --- + ...
6625 * n=0 n! 0! 1! 2! 3! 4!
6626 *
6627 * 2 3 4
6628 * x x x
6629 * = 1 + x + --- + --- + --- + ...
6630 * 2! 3! 4!
6631 * @endcode
6632 *
6633 * Given z = x * ln2, we get:
6634 * @code{.unparsed}
6635 * 2 3 4 n
6636 * z z z z z
6637 * e - 1 = z + --- + --- + --- + ... + ---
6638 * 2! 3! 4! n!
6639 * @endcode
6640 *
6641 * Wanting to use Horner's method, we move one z outside and get:
6642 * @code{.unparsed}
6643 * 2 3 (n-1)
6644 * z z z z
6645 * = z ( 1 + --- + --- + --- + ... + ------- )
6646 * 2! 3! 4! n!
6647 * @endcode
6648 *
6649 * The constants we need for using Horner's methods are 1 and 1 / n!.
6650 *
6651 * For very tiny x values, we can get away with f(x) = x * ln 2, because
6652 * because we don't have the necessary precision to represent 1.0 + z/3 + ...
6653 * and can approximate it to be 1.0. For a visual demonstration of this
6654 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
6655 * as it valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
6656 *
6657 *
6658 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
6659 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
6660 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
6661 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
6662 * blocks). (The one bit difference is probably an implicit one missing from
6663 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
6664 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
6665 * exponent.
6666 *
6667 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
6668 * successfully reproduced the exact results from an Intel 10980XE, there is
6669 * always a portition of rounding differences. Not going to spend too much time
6670 * on getting this 100% the same, at least not now.
6671 *
6672 * P.S. If someone are really curious about 8087 and its contstants:
6673 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
6674 *
6675 *
6676 * @param pr80Val The exponent value (x), less than 1.0, greater than
6677 * -1.0 and not zero. This can be a normal, denormal
6678 * or pseudo-denormal value.
6679 * @param pr80Result Where to return the result.
6680 * @param fFcw FPU control word.
6681 * @param fFsw FPU status word.
6682 */
6683static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6684{
6685 /* As mentioned above, we can skip the expensive polynomial calculation
6686 as it will be close enough to 1.0 that it makes no difference.
6687
6688 The cutoff point for intel 10980XE is exponents >= -69. Intel
6689 also seems to be using a 67-bit or 68-bit constant value, and we get
6690 a smattering of rounding differences if we go for higher precision. */
6691 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
6692 {
6693 RTUINT256U u256;
6694 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
6695 u256.QWords.qw0 |= 1; /* force #PE */
6696 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
6697 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
6698 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
6699 : 1 - RTFLOAT80U_EXP_BIAS,
6700 fFcw, fFsw);
6701 }
6702 else
6703 {
6704#ifdef IEM_WITH_FLOAT128_FOR_FPU
6705 /* This approach is not good enough for small values, we end up with zero. */
6706 int const fOldRounding = iemFpuF128SetRounding(fFcw);
6707 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
6708 _Float128 rd128Result = powf128(2.0L, rd128Val);
6709 rd128Result -= 1.0L;
6710 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
6711 iemFpuF128RestoreRounding(fOldRounding);
6712
6713# else
6714 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6715 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
6716
6717 /* As mentioned above, enforce 68-bit internal mantissa width to better
6718 match the Intel 10980XE results. */
6719 unsigned const cPrecision = 68;
6720
6721 /* first calculate z = x * ln2 */
6722 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
6723 cPrecision);
6724
6725 /* Then do the polynomial evaluation. */
6726 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
6727 cPrecision, &SoftState);
6728 r = f128_mul(z, r, &SoftState);
6729
6730 /* Output the result. */
6731 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
6732# endif
6733 }
6734 return fFsw;
6735}
6736
6737
6738IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6739{
6740 uint16_t const fFcw = pFpuState->FCW;
6741 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6742
6743 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6744 {
6745 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
6746 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6747 else
6748 {
6749 /* Special case:
6750 2^+1.0 - 1.0 = 1.0
6751 2^-1.0 - 1.0 = -0.5 */
6752 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
6753 && pr80Val->s.uMantissa == RT_BIT_64(63))
6754 {
6755 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
6756 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
6757 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6758 }
6759 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
6760 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
6761 else
6762 pFpuRes->r80Result = *pr80Val;
6763 fFsw |= X86_FSW_PE;
6764 if (!(fFcw & X86_FCW_PM))
6765 fFsw |= X86_FSW_ES | X86_FSW_B;
6766 }
6767 }
6768 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6769 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6770 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6771 pFpuRes->r80Result = *pr80Val;
6772 else if (RTFLOAT80U_IS_INF(pr80Val))
6773 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
6774 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6775 {
6776 fFsw |= X86_FSW_DE;
6777 if (fFcw & X86_FCW_DM)
6778 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6779 else
6780 {
6781 pFpuRes->r80Result = *pr80Val;
6782 fFsw |= X86_FSW_ES | X86_FSW_B;
6783 }
6784 }
6785 else
6786 {
6787 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6788 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6789 && (fFcw & X86_FCW_IM))
6790 pFpuRes->r80Result = g_r80Indefinite;
6791 else
6792 {
6793 pFpuRes->r80Result = *pr80Val;
6794 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6795 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6796 }
6797 fFsw |= X86_FSW_IE;
6798 if (!(fFcw & X86_FCW_IM))
6799 fFsw |= X86_FSW_ES | X86_FSW_B;
6800 }
6801 pFpuRes->FSW = fFsw;
6802}
6803
6804#endif /* IEM_WITHOUT_ASSEMBLY */
6805
6806IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6807{
6808 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
6809}
6810
6811IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6812{
6813 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
6814}
6815
6816#ifdef IEM_WITHOUT_ASSEMBLY
6817
6818IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6819{
6820 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6821 pFpuRes->r80Result = *pr80Val;
6822 pFpuRes->r80Result.s.fSign = 0;
6823}
6824
6825
6826IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6827{
6828 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6829 pFpuRes->r80Result = *pr80Val;
6830 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
6831}
6832
6833
6834IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6835{
6836 uint16_t const fFcw = pFpuState->FCW;
6837 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6838
6839 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6840 {
6841 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
6842 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
6843
6844 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
6845 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
6846 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
6847 }
6848 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6849 {
6850 fFsw |= X86_FSW_ZE;
6851 if (fFcw & X86_FCW_ZM)
6852 {
6853 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
6854 pFpuResTwo->r80Result2 = *pr80Val;
6855 }
6856 else
6857 {
6858 pFpuResTwo->r80Result2 = *pr80Val;
6859 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6860 }
6861 }
6862 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6863 {
6864 fFsw |= X86_FSW_DE;
6865 if (fFcw & X86_FCW_DM)
6866 {
6867 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
6868 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
6869 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
6870 int32_t iExponent = -16382;
6871 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
6872 {
6873 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
6874 iExponent--;
6875 }
6876
6877 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
6878 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
6879 }
6880 else
6881 {
6882 pFpuResTwo->r80Result2 = *pr80Val;
6883 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6884 }
6885 }
6886 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6887 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6888 {
6889 pFpuResTwo->r80Result1 = *pr80Val;
6890 pFpuResTwo->r80Result2 = *pr80Val;
6891 }
6892 else if (RTFLOAT80U_IS_INF(pr80Val))
6893 {
6894 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
6895 pFpuResTwo->r80Result2 = *pr80Val;
6896 }
6897 else
6898 {
6899 if (fFcw & X86_FCW_IM)
6900 {
6901 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6902 pFpuResTwo->r80Result1 = g_r80Indefinite;
6903 else
6904 {
6905 pFpuResTwo->r80Result1 = *pr80Val;
6906 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6907 }
6908 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
6909 }
6910 else
6911 {
6912 pFpuResTwo->r80Result2 = *pr80Val;
6913 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6914 }
6915 fFsw |= X86_FSW_IE;
6916 }
6917 pFpuResTwo->FSW = fFsw;
6918}
6919
6920
6921IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6922 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6923{
6924 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6925 AssertReleaseFailed();
6926}
6927
6928#endif /* IEM_WITHOUT_ASSEMBLY */
6929
6930IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6931 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6932{
6933 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6934}
6935
6936IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6937 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6938{
6939 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6940}
6941
6942#if defined(IEM_WITHOUT_ASSEMBLY)
6943
6944IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6945 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6946{
6947 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6948 AssertReleaseFailed();
6949}
6950
6951#endif /* IEM_WITHOUT_ASSEMBLY */
6952
6953IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6954 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6955{
6956 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6957}
6958
6959IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6960 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6961{
6962 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6963}
6964
6965
6966/*********************************************************************************************************************************
6967* MMX, SSE & AVX *
6968*********************************************************************************************************************************/
6969
6970/*
6971 * MOVSLDUP / VMOVSLDUP
6972 */
6973IEM_DECL_IMPL_DEF(void, iemAImpl_movsldup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
6974{
6975 puDst->au32[0] = puSrc->au32[0];
6976 puDst->au32[1] = puSrc->au32[0];
6977 puDst->au32[2] = puSrc->au32[2];
6978 puDst->au32[3] = puSrc->au32[2];
6979}
6980
6981#ifdef IEM_WITH_VEX
6982
6983IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
6984{
6985 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
6986 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
6987 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
6988 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
6989 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6990 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6991 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6992 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6993}
6994
6995
6996IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
6997{
6998 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
6999 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7000 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7001 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7002 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7003 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7004 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7005 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7006}
7007
7008#endif /* IEM_WITH_VEX */
7009
7010
7011/*
7012 * MOVSHDUP / VMOVSHDUP
7013 */
7014IEM_DECL_IMPL_DEF(void, iemAImpl_movshdup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7015{
7016 puDst->au32[0] = puSrc->au32[1];
7017 puDst->au32[1] = puSrc->au32[1];
7018 puDst->au32[2] = puSrc->au32[3];
7019 puDst->au32[3] = puSrc->au32[3];
7020}
7021
7022#ifdef IEM_WITH_VEX
7023
7024IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7025{
7026 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7027 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7028 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7029 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7030 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7031 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7032 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7033 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7034}
7035
7036
7037IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7038{
7039 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7040 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7041 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7042 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7043 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7044 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7045 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7046 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7047}
7048
7049#endif /* IEM_WITH_VEX */
7050
7051
7052/*
7053 * MOVDDUP / VMOVDDUP
7054 */
7055IEM_DECL_IMPL_DEF(void, iemAImpl_movddup,(PRTUINT128U puDst, uint64_t uSrc))
7056{
7057 puDst->au64[0] = uSrc;
7058 puDst->au64[1] = uSrc;
7059}
7060
7061#ifdef IEM_WITH_VEX
7062
7063IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7064{
7065 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7066 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7067 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7068 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7069}
7070
7071IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7072{
7073 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7074 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7075 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7076 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7077}
7078
7079#endif /* IEM_WITH_VEX */
7080
7081
7082/*
 * PAND / VPAND / ANDPS / VANDPS / ANDPD / VANDPD
7084 */
7085#ifdef IEM_WITHOUT_ASSEMBLY
7086
7087IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7088{
7089 RT_NOREF(pFpuState);
7090 *puDst &= *puSrc;
7091}
7092
7093
7094IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7095{
7096 RT_NOREF(pFpuState);
7097 puDst->au64[0] &= puSrc->au64[0];
7098 puDst->au64[1] &= puSrc->au64[1];
7099}
7100
7101#endif
7102
7103IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7104 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7105{
7106 RT_NOREF(pExtState);
7107 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7108 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7109}
7110
7111
7112IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7113 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7114{
7115 RT_NOREF(pExtState);
7116 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7117 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7118 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7119 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7120}
7121
7122
7123/*
 * PANDN / VPANDN / ANDNPS / VANDNPS / ANDNPD / VANDNPD
7125 */
7126#ifdef IEM_WITHOUT_ASSEMBLY
7127
7128IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7129{
7130 RT_NOREF(pFpuState);
7131 *puDst = ~*puDst & *puSrc;
7132}
7133
7134
7135IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7136{
7137 RT_NOREF(pFpuState);
7138 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7139 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7140}
7141
7142#endif
7143
7144IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7145 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7146{
7147 RT_NOREF(pExtState);
7148 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7149 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7150}
7151
7152
7153IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7154 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7155{
7156 RT_NOREF(pExtState);
7157 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7158 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7159 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7160 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7161}
7162
7163
7164/*
 * POR / VPOR / ORPS / VORPS / ORPD / VORPD
7166 */
7167#ifdef IEM_WITHOUT_ASSEMBLY
7168
7169IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7170{
7171 RT_NOREF(pFpuState);
7172 *puDst |= *puSrc;
7173}
7174
7175
7176IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7177{
7178 RT_NOREF(pFpuState);
7179 puDst->au64[0] |= puSrc->au64[0];
7180 puDst->au64[1] |= puSrc->au64[1];
7181}
7182
7183#endif
7184
7185IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7186 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7187{
7188 RT_NOREF(pExtState);
7189 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7190 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7191}
7192
7193
7194IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7195 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7196{
7197 RT_NOREF(pExtState);
7198 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7199 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7200 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7201 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7202}
7203
7204
7205/*
 * PXOR / VPXOR / XORPS / VXORPS / XORPD / VXORPD
7207 */
7208#ifdef IEM_WITHOUT_ASSEMBLY
7209
7210IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7211{
7212 RT_NOREF(pFpuState);
7213 *puDst ^= *puSrc;
7214}
7215
7216
7217IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7218{
7219 RT_NOREF(pFpuState);
7220 puDst->au64[0] ^= puSrc->au64[0];
7221 puDst->au64[1] ^= puSrc->au64[1];
7222}
7223
7224#endif
7225
7226IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7227 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7228{
7229 RT_NOREF(pExtState);
7230 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7231 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7232}
7233
7234
7235IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7236 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7237{
7238 RT_NOREF(pExtState);
7239 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7240 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7241 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7242 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7243}
7244
7245
7246/*
7247 * PCMPEQB / VPCMPEQB
7248 */
7249#ifdef IEM_WITHOUT_ASSEMBLY
7250
7251IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7252{
7253 RT_NOREF(pFpuState);
7254 RTUINT64U uSrc1 = { *puDst };
7255 RTUINT64U uSrc2 = { *puSrc };
7256 RTUINT64U uDst;
7257 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7258 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7259 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7260 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7261 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7262 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7263 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7264 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7265 *puDst = uDst.u;
7266}
7267
7268
7269IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7270{
7271 RT_NOREF(pFpuState);
7272 RTUINT128U uSrc1 = *puDst;
7273 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7274 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7275 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7276 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7277 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7278 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7279 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7280 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7281 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7282 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7283 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7284 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7285 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7286 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7287 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7288 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7289}
7290
7291#endif
7292
7293IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7294 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7295{
7296 RT_NOREF(pExtState);
7297 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7298 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7299 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7300 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7301 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7302 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7303 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7304 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7305 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7306 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7307 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7308 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7309 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7310 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7311 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7312 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7313}
7314
7315IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7316 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7317{
7318 RT_NOREF(pExtState);
7319 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7320 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7321 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7322 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7323 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7324 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7325 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7326 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7327 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7328 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7329 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7330 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7331 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7332 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7333 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7334 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7335 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7336 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7337 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7338 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7339 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7340 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7341 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7342 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7343 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7344 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7345 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7346 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7347 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7348 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7349 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7350 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7351}
7352
7353
7354/*
7355 * PCMPEQW / VPCMPEQW
7356 */
7357#ifdef IEM_WITHOUT_ASSEMBLY
7358
7359IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7360{
7361 RT_NOREF(pFpuState);
7362 RTUINT64U uSrc1 = { *puDst };
7363 RTUINT64U uSrc2 = { *puSrc };
7364 RTUINT64U uDst;
7365 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7366 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7367 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7368 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7369 *puDst = uDst.u;
7370}
7371
7372
7373IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7374{
7375 RT_NOREF(pFpuState);
7376 RTUINT128U uSrc1 = *puDst;
7377 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7378 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7379 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7380 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7381 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7382 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7383 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7384 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7385}
7386
7387#endif
7388
7389IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7390 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7391{
7392 RT_NOREF(pExtState);
7393 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7394 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7395 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7396 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7397 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7398 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7399 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7400 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7401}
7402
7403IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7404 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7405{
7406 RT_NOREF(pExtState);
7407 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7408 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7409 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7410 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7411 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7412 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7413 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7414 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7415 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
7416 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
7417 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
7418 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
7419 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
7420 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
7421 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
7422 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
7423}
7424
7425
7426/*
7427 * PCMPEQD / VPCMPEQD.
7428 */
7429#ifdef IEM_WITHOUT_ASSEMBLY
7430
7431IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7432{
7433 RT_NOREF(pFpuState);
7434 RTUINT64U uSrc1 = { *puDst };
7435 RTUINT64U uSrc2 = { *puSrc };
7436 RTUINT64U uDst;
7437 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
7438 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
7439 *puDst = uDst.u;
7440}
7441
7442
7443IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7444{
7445 RT_NOREF(pFpuState);
7446 RTUINT128U uSrc1 = *puDst;
7447 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
7448 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
7449 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
7450 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
7451}
7452
7453#endif /* IEM_WITHOUT_ASSEMBLY */
7454
7455IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7456 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7457{
7458 RT_NOREF(pExtState);
7459 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7460 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7461 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7462 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7463}
7464
7465IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7466 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7467{
7468 RT_NOREF(pExtState);
7469 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7470 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7471 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7472 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7473 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
7474 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
7475 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
7476 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
7477}
7478
7479
7480/*
7481 * PCMPEQQ / VPCMPEQQ.
7482 */
7483IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7484{
7485 RT_NOREF(pFpuState);
7486 RTUINT128U uSrc1 = *puDst;
7487 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
7488 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
7489}
7490
7491IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7492 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7493{
7494 RT_NOREF(pExtState);
7495 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7496 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7497}
7498
7499IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7500 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7501{
7502 RT_NOREF(pExtState);
7503 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7504 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7505 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
7506 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
7507}
7508
7509
7510/*
7511 * PCMPGTB / VPCMPGTB
7512 */
7513#ifdef IEM_WITHOUT_ASSEMBLY
7514
7515IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7516{
7517 RT_NOREF(pFpuState);
7518 RTUINT64U uSrc1 = { *puDst };
7519 RTUINT64U uSrc2 = { *puSrc };
7520 RTUINT64U uDst;
7521 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
7522 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
7523 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
7524 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
7525 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
7526 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
7527 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
7528 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
7529 *puDst = uDst.u;
7530}
7531
7532
7533IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7534{
7535 RT_NOREF(pFpuState);
7536 RTUINT128U uSrc1 = *puDst;
7537 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
7538 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
7539 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
7540 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
7541 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
7542 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
7543 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
7544 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
7545 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
7546 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
7547 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
7548 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
7549 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
7550 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
7551 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
7552 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
7553}
7554
7555#endif
7556
7557IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7558 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7559{
7560 RT_NOREF(pExtState);
7561 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7562 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7563 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7564 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7565 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7566 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7567 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7568 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7569 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7570 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7571 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7572 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7573 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7574 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7575 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7576 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7577}
7578
7579IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7580 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7581{
7582 RT_NOREF(pExtState);
7583 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7584 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7585 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7586 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7587 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7588 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7589 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7590 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7591 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7592 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7593 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7594 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7595 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7596 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7597 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7598 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7599 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
7600 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
7601 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
7602 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
7603 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
7604 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
7605 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
7606 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
7607 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
7608 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
7609 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
7610 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
7611 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
7612 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
7613 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
7614 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
7615}
7616
7617
7618/*
7619 * PCMPGTW / VPCMPGTW
7620 */
7621#ifdef IEM_WITHOUT_ASSEMBLY
7622
7623IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7624{
7625 RT_NOREF(pFpuState);
7626 RTUINT64U uSrc1 = { *puDst };
7627 RTUINT64U uSrc2 = { *puSrc };
7628 RTUINT64U uDst;
7629 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
7630 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
7631 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
7632 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
7633 *puDst = uDst.u;
7634}
7635
7636
7637IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7638{
7639 RT_NOREF(pFpuState);
7640 RTUINT128U uSrc1 = *puDst;
7641 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
7642 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
7643 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
7644 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
7645 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
7646 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
7647 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
7648 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
7649}
7650
7651#endif
7652
7653IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7654 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7655{
7656 RT_NOREF(pExtState);
7657 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
7658 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
7659 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
7660 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
7661 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
7662 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
7663 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
7664 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
7665}
7666
7667IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7668 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7669{
7670 RT_NOREF(pExtState);
7671 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
7672 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
7673 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
7674 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
7675 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
7676 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
7677 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
7678 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
7679 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
7680 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
7681 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
7682 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
7683 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
7684 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
7685 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
7686 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
7687}
7688
7689
7690/*
7691 * PCMPGTD / VPCMPGTD.
7692 */
7693#ifdef IEM_WITHOUT_ASSEMBLY
7694
7695IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7696{
7697 RT_NOREF(pFpuState);
7698 RTUINT64U uSrc1 = { *puDst };
7699 RTUINT64U uSrc2 = { *puSrc };
7700 RTUINT64U uDst;
7701 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
7702 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
7703 *puDst = uDst.u;
7704}
7705
7706
7707IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7708{
7709 RT_NOREF(pFpuState);
7710 RTUINT128U uSrc1 = *puDst;
7711 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
7712 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
7713 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
7714 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
7715}
7716
7717#endif /* IEM_WITHOUT_ASSEMBLY */
7718
7719IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7720 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7721{
7722 RT_NOREF(pExtState);
7723 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
7724 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
7725 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
7726 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
7727}
7728
7729IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7730 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7731{
7732 RT_NOREF(pExtState);
7733 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
7734 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
7735 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
7736 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
7737 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
7738 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
7739 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
7740 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
7741}
7742
7743
7744/*
7745 * PCMPGTQ / VPCMPGTQ.
7746 */
7747IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7748{
7749 RT_NOREF(pFpuState);
7750 RTUINT128U uSrc1 = *puDst;
7751 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
7752 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
7753}
7754
7755IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7756 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7757{
7758 RT_NOREF(pExtState);
7759 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
7760 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
7761}
7762
7763IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7764 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7765{
7766 RT_NOREF(pExtState);
7767 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
7768 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
7769 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
7770 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
7771}
7772
7773
7774/*
7775 * PADDB / VPADDB
7776 */
7777#ifdef IEM_WITHOUT_ASSEMBLY
7778
7779IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7780{
7781 RT_NOREF(pFpuState);
7782 RTUINT64U uSrc1 = { *puDst };
7783 RTUINT64U uSrc2 = { *puSrc };
7784 RTUINT64U uDst;
7785 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
7786 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
7787 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
7788 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
7789 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
7790 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
7791 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
7792 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
7793 *puDst = uDst.u;
7794}
7795
7796
7797IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7798{
7799 RT_NOREF(pFpuState);
7800 RTUINT128U uSrc1 = *puDst;
7801 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
7802 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
7803 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
7804 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
7805 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
7806 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
7807 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
7808 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
7809 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
7810 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
7811 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
7812 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
7813 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
7814 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
7815 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
7816 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
7817}
7818
7819#endif
7820
7821
7822IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7823 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7824{
7825 RT_NOREF(pExtState);
7826 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
7827 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
7828 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
7829 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
7830 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
7831 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
7832 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
7833 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
7834 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
7835 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
7836 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
7837 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
7838 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
7839 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
7840 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
7841 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
7842}
7843
7844IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7845 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7846{
7847 RT_NOREF(pExtState);
7848 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
7849 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
7850 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
7851 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
7852 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
7853 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
7854 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
7855 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
7856 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
7857 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
7858 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
7859 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
7860 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
7861 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
7862 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
7863 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
7864 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
7865 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
7866 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
7867 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
7868 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
7869 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
7870 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
7871 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
7872 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
7873 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
7874 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
7875 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
7876 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
7877 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
7878 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
7879 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
7880}
7881
7882
7883/*
7884 * PADDSB / VPADDSB
7885 */
/** Saturates a signed intermediate to the int8_t range and yields the raw byte value.
 * Out-of-range inputs give 0x7f (INT8_MAX) plus the input's bit 15, i.e. 0x80 (INT8_MIN)
 * for negative inputs.  The argument is evaluated more than once. */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) /* out of range: clamp by sign */ \
      : (uint8_t)(a_iWord) )
7890
7891#ifdef IEM_WITHOUT_ASSEMBLY
7892
7893IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7894{
7895 RT_NOREF(pFpuState);
7896 RTUINT64U uSrc1 = { *puDst };
7897 RTUINT64U uSrc2 = { *puSrc };
7898 RTUINT64U uDst;
7899 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
7900 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
7901 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
7902 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
7903 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
7904 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
7905 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
7906 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
7907 *puDst = uDst.u;
7908}
7909
7910
7911IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7912{
7913 RT_NOREF(pFpuState);
7914 RTUINT128U uSrc1 = *puDst;
7915 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
7916 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
7917 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
7918 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
7919 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
7920 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
7921 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
7922 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
7923 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
7924 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
7925 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
7926 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
7927 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
7928 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
7929 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
7930 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
7931}
7932
7933#endif
7934
7935
/*
 * PADDUSB / VPADDUSB
 */
/** Saturates an unsigned sum to the uint8_t range: values whose low 16 bits exceed
 * 0xff yield 0xff (UINT8_MAX), everything else is passed through as a byte.
 * The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff /* clamp to UINT8_MAX */ \
      : (uint8_t)(a_uWord) )
7943
7944#ifdef IEM_WITHOUT_ASSEMBLY
7945
7946IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7947{
7948 RT_NOREF(pFpuState);
7949 RTUINT64U uSrc1 = { *puDst };
7950 RTUINT64U uSrc2 = { *puSrc };
7951 RTUINT64U uDst;
7952 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
7953 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
7954 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
7955 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
7956 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
7957 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
7958 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
7959 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
7960 *puDst = uDst.u;
7961}
7962
7963
7964IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7965{
7966 RT_NOREF(pFpuState);
7967 RTUINT128U uSrc1 = *puDst;
7968 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
7969 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
7970 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
7971 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
7972 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
7973 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
7974 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
7975 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
7976 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
7977 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
7978 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
7979 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
7980 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
7981 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
7982 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
7983 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
7984}
7985
7986#endif
7987
7988
7989/*
7990 * PADDW / VPADDW
7991 */
7992#ifdef IEM_WITHOUT_ASSEMBLY
7993
7994IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7995{
7996 RT_NOREF(pFpuState);
7997 RTUINT64U uSrc1 = { *puDst };
7998 RTUINT64U uSrc2 = { *puSrc };
7999 RTUINT64U uDst;
8000 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8001 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8002 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8003 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8004 *puDst = uDst.u;
8005}
8006
8007
8008IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8009{
8010 RT_NOREF(pFpuState);
8011 RTUINT128U uSrc1 = *puDst;
8012 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8013 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8014 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8015 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8016 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8017 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8018 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8019 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8020}
8021
8022#endif
8023
8024
8025IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8026 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8027{
8028 RT_NOREF(pExtState);
8029 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8030 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8031 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8032 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8033 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8034 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8035 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8036 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8037}
8038
8039IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8040 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8041{
8042 RT_NOREF(pExtState);
8043 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8044 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8045 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8046 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8047 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8048 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8049 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8050 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8051 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8052 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8053 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8054 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8055 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8056 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8057 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8058 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8059}
8060
8061
8062/*
8063 * PADDSW / VPADDSW
8064 */
/** Saturates a signed intermediate to the int16_t range and yields the raw word value.
 * Out-of-range inputs give 0x7fff (INT16_MAX) plus the input's bit 31, i.e. 0x8000
 * (INT16_MIN) for negative inputs.  The argument is evaluated more than once. */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) /* out of range: clamp by sign */ \
      : (uint16_t)(a_iDword) )
8069
8070#ifdef IEM_WITHOUT_ASSEMBLY
8071
8072IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8073{
8074 RT_NOREF(pFpuState);
8075 RTUINT64U uSrc1 = { *puDst };
8076 RTUINT64U uSrc2 = { *puSrc };
8077 RTUINT64U uDst;
8078 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8079 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8080 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8081 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8082 *puDst = uDst.u;
8083}
8084
8085
8086IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8087{
8088 RT_NOREF(pFpuState);
8089 RTUINT128U uSrc1 = *puDst;
8090 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8091 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8092 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8093 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8094 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8095 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8096 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8097 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8098}
8099
8100#endif
8101
8102
8103/*
8104 * PADDUSW / VPADDUSW
8105 */
/** Saturates an unsigned sum to the uint16_t range: values above 0xffff (after
 * conversion to uint32_t) yield 0xffff (UINT16_MAX), everything else is passed
 * through as a word.  The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff /* clamp to UINT16_MAX */ \
      : (uint16_t)(a_uDword) )
8110
8111#ifdef IEM_WITHOUT_ASSEMBLY
8112
8113IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8114{
8115 RT_NOREF(pFpuState);
8116 RTUINT64U uSrc1 = { *puDst };
8117 RTUINT64U uSrc2 = { *puSrc };
8118 RTUINT64U uDst;
8119 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8120 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8121 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8122 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8123 *puDst = uDst.u;
8124}
8125
8126
8127IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8128{
8129 RT_NOREF(pFpuState);
8130 RTUINT128U uSrc1 = *puDst;
8131 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8132 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8133 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8134 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8135 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8136 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8137 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8138 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8139}
8140
8141#endif
8142
8143
8144/*
8145 * PADDD / VPADDD.
8146 */
8147#ifdef IEM_WITHOUT_ASSEMBLY
8148
8149IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8150{
8151 RT_NOREF(pFpuState);
8152 RTUINT64U uSrc1 = { *puDst };
8153 RTUINT64U uSrc2 = { *puSrc };
8154 RTUINT64U uDst;
8155 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8156 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8157 *puDst = uDst.u;
8158}
8159
8160
8161IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8162{
8163 RT_NOREF(pFpuState);
8164 RTUINT128U uSrc1 = *puDst;
8165 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8166 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8167 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8168 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8169}
8170
8171#endif /* IEM_WITHOUT_ASSEMBLY */
8172
8173IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8174 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8175{
8176 RT_NOREF(pExtState);
8177 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8178 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8179 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8180 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8181}
8182
8183IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8184 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8185{
8186 RT_NOREF(pExtState);
8187 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8188 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8189 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8190 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8191 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8192 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8193 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8194 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8195}
8196
8197
8198/*
8199 * PADDQ / VPADDQ.
8200 */
8201#ifdef IEM_WITHOUT_ASSEMBLY
8202
8203IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8204{
8205 RT_NOREF(pFpuState);
8206 *puDst = *puDst + *puSrc;
8207}
8208
8209IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8210{
8211 RT_NOREF(pFpuState);
8212 RTUINT128U uSrc1 = *puDst;
8213 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8214 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8215}
8216
8217#endif
8218
8219IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8220 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8221{
8222 RT_NOREF(pExtState);
8223 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8224 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8225}
8226
8227IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8228 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8229{
8230 RT_NOREF(pExtState);
8231 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8232 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8233 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8234 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8235}
8236
8237
8238/*
8239 * PSUBB / VPSUBB
8240 */
8241#ifdef IEM_WITHOUT_ASSEMBLY
8242
8243IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8244{
8245 RT_NOREF(pFpuState);
8246 RTUINT64U uSrc1 = { *puDst };
8247 RTUINT64U uSrc2 = { *puSrc };
8248 RTUINT64U uDst;
8249 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8250 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8251 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8252 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8253 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8254 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8255 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8256 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8257 *puDst = uDst.u;
8258}
8259
8260
8261IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8262{
8263 RT_NOREF(pFpuState);
8264 RTUINT128U uSrc1 = *puDst;
8265 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8266 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8267 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8268 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8269 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8270 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8271 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8272 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8273 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8274 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8275 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8276 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8277 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8278 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8279 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8280 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8281}
8282
8283#endif
8284
8285IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8286 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8287{
8288 RT_NOREF(pExtState);
8289 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8290 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8291 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8292 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8293 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8294 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8295 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8296 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8297 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8298 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8299 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8300 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8301 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8302 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8303 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8304 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8305}
8306
8307IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8308 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8309{
8310 RT_NOREF(pExtState);
8311 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8312 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8313 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8314 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8315 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8316 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8317 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8318 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8319 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8320 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8321 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8322 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8323 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8324 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8325 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8326 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8327 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
8328 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
8329 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
8330 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
8331 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
8332 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
8333 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
8334 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
8335 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
8336 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
8337 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
8338 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
8339 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
8340 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
8341 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
8342 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
8343}
8344
8345
/*
 * PSUBSB / VPSUBSB
 */
8349#ifdef IEM_WITHOUT_ASSEMBLY
8350
8351IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8352{
8353 RT_NOREF(pFpuState);
8354 RTUINT64U uSrc1 = { *puDst };
8355 RTUINT64U uSrc2 = { *puSrc };
8356 RTUINT64U uDst;
8357 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
8358 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
8359 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
8360 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
8361 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
8362 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
8363 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
8364 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
8365 *puDst = uDst.u;
8366}
8367
8368
8369IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8370{
8371 RT_NOREF(pFpuState);
8372 RTUINT128U uSrc1 = *puDst;
8373 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
8374 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
8375 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
8376 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
8377 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
8378 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
8379 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
8380 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
8381 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
8382 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
8383 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
8384 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
8385 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
8386 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
8387 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
8388 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
8389}
8390
8391#endif
8392
8393
/*
 * PSUBUSB / VPSUBUSB
 */
/** Narrows the result of an unsigned byte subtraction: when the value viewed
 * as uint16_t exceeds 0xff (which is what a negative difference turns into)
 * the result is 0, otherwise it is the low byte.
 * @note The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
8401
8402#ifdef IEM_WITHOUT_ASSEMBLY
8403
8404IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8405{
8406 RT_NOREF(pFpuState);
8407 RTUINT64U uSrc1 = { *puDst };
8408 RTUINT64U uSrc2 = { *puSrc };
8409 RTUINT64U uDst;
8410 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
8411 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
8412 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
8413 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
8414 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
8415 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
8416 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
8417 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
8418 *puDst = uDst.u;
8419}
8420
8421
8422IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8423{
8424 RT_NOREF(pFpuState);
8425 RTUINT128U uSrc1 = *puDst;
8426 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
8427 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
8428 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
8429 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
8430 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
8431 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
8432 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
8433 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
8434 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
8435 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
8436 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
8437 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
8438 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
8439 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
8440 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
8441 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
8442}
8443
8444#endif
8445
8446
8447/*
8448 * PSUBW / VPSUBW
8449 */
8450#ifdef IEM_WITHOUT_ASSEMBLY
8451
8452IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8453{
8454 RT_NOREF(pFpuState);
8455 RTUINT64U uSrc1 = { *puDst };
8456 RTUINT64U uSrc2 = { *puSrc };
8457 RTUINT64U uDst;
8458 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
8459 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
8460 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
8461 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
8462 *puDst = uDst.u;
8463}
8464
8465
8466IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8467{
8468 RT_NOREF(pFpuState);
8469 RTUINT128U uSrc1 = *puDst;
8470 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
8471 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
8472 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
8473 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
8474 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
8475 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
8476 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
8477 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
8478}
8479
8480#endif
8481
8482IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8483 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8484{
8485 RT_NOREF(pExtState);
8486 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8487 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8488 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8489 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8490 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8491 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8492 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8493 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8494}
8495
8496IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8497 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8498{
8499 RT_NOREF(pExtState);
8500 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8501 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8502 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8503 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8504 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8505 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8506 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8507 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8508 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
8509 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
8510 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
8511 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
8512 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
8513 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
8514 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
8515 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
8516}
8517
8518
8519/*
8520 * PSUBSW / VPSUBSW
8521 */
8522#ifdef IEM_WITHOUT_ASSEMBLY
8523
8524IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8525{
8526 RT_NOREF(pFpuState);
8527 RTUINT64U uSrc1 = { *puDst };
8528 RTUINT64U uSrc2 = { *puSrc };
8529 RTUINT64U uDst;
8530 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
8531 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
8532 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
8533 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
8534 *puDst = uDst.u;
8535}
8536
8537
8538IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8539{
8540 RT_NOREF(pFpuState);
8541 RTUINT128U uSrc1 = *puDst;
8542 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
8543 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
8544 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
8545 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
8546 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
8547 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
8548 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
8549 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
8550}
8551
8552#endif
8553
8554
8555/*
8556 * PSUBUSW / VPSUBUSW
8557 */
/** Narrows the result of an unsigned word subtraction: when the value viewed
 * as uint32_t exceeds 0xffff (which is what a negative difference turns into)
 * the result is 0, otherwise it is the low word.
 * @note The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
8562
8563#ifdef IEM_WITHOUT_ASSEMBLY
8564
8565IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8566{
8567 RT_NOREF(pFpuState);
8568 RTUINT64U uSrc1 = { *puDst };
8569 RTUINT64U uSrc2 = { *puSrc };
8570 RTUINT64U uDst;
8571 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
8572 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
8573 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
8574 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
8575 *puDst = uDst.u;
8576}
8577
8578
8579IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8580{
8581 RT_NOREF(pFpuState);
8582 RTUINT128U uSrc1 = *puDst;
8583 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
8584 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
8585 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
8586 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
8587 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
8588 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
8589 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
8590 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
8591}
8592
8593#endif
8594
8595
8596/*
8597 * PSUBD / VPSUBD.
8598 */
8599#ifdef IEM_WITHOUT_ASSEMBLY
8600
8601IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8602{
8603 RT_NOREF(pFpuState);
8604 RTUINT64U uSrc1 = { *puDst };
8605 RTUINT64U uSrc2 = { *puSrc };
8606 RTUINT64U uDst;
8607 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
8608 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
8609 *puDst = uDst.u;
8610}
8611
8612
8613IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8614{
8615 RT_NOREF(pFpuState);
8616 RTUINT128U uSrc1 = *puDst;
8617 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
8618 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
8619 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
8620 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
8621}
8622
8623#endif /* IEM_WITHOUT_ASSEMBLY */
8624
8625IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8626 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8627{
8628 RT_NOREF(pExtState);
8629 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
8630 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
8631 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
8632 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
8633}
8634
8635IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8636 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8637{
8638 RT_NOREF(pExtState);
8639 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
8640 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
8641 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
8642 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
8643 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
8644 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
8645 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
8646 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
8647}
8648
8649
8650/*
8651 * PSUBQ / VPSUBQ.
8652 */
8653#ifdef IEM_WITHOUT_ASSEMBLY
8654
8655IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8656{
8657 RT_NOREF(pFpuState);
8658 *puDst = *puDst - *puSrc;
8659}
8660
8661IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8662{
8663 RT_NOREF(pFpuState);
8664 RTUINT128U uSrc1 = *puDst;
8665 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
8666 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
8667}
8668
8669#endif
8670
8671IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8672 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8673{
8674 RT_NOREF(pExtState);
8675 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
8676 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
8677}
8678
8679IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8680 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8681{
8682 RT_NOREF(pExtState);
8683 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
8684 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
8685 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
8686 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
8687}
8688
8689
8690
8691/*
8692 * PMULLW / VPMULLW / PMULLD / VPMULLD
8693 */
8694#ifdef IEM_WITHOUT_ASSEMBLY
8695
8696IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8697{
8698 RT_NOREF(pFpuState);
8699 RTUINT64U uSrc1 = { *puDst };
8700 RTUINT64U uSrc2 = { *puSrc };
8701 RTUINT64U uDst;
8702 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
8703 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
8704 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
8705 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
8706 *puDst = uDst.u;
8707}
8708
8709
8710IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8711{
8712 RT_NOREF(pFpuState);
8713 RTUINT128U uSrc1 = *puDst;
8714 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
8715 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
8716 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
8717 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
8718 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
8719 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
8720 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
8721 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
8722}
8723
8724#endif
8725
8726IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8727{
8728 RTUINT128U uSrc1 = *puDst;
8729
8730 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
8731 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
8732 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
8733 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
8734 RT_NOREF(pFpuState);
8735}
8736
8737
8738IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8739{
8740 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
8741 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
8742 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
8743 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
8744 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
8745 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
8746 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
8747 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
8748}
8749
8750
8751IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8752{
8753 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
8754 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
8755 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
8756 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
8757 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
8758 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
8759 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
8760 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
8761 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
8762 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
8763 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
8764 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
8765 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
8766 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
8767 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
8768 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
8769}
8770
8771
8772IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8773{
8774 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
8775 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
8776 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
8777 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
8778}
8779
8780
8781IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8782{
8783 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
8784 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
8785 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
8786 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
8787 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
8788 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
8789 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
8790 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
8791}
8792
8793
8794/*
8795 * PMULHW / VPMULHW
8796 */
8797#ifdef IEM_WITHOUT_ASSEMBLY
8798
8799IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8800{
8801 RT_NOREF(pFpuState);
8802 RTUINT64U uSrc1 = { *puDst };
8803 RTUINT64U uSrc2 = { *puSrc };
8804 RTUINT64U uDst;
8805 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
8806 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
8807 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
8808 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
8809 *puDst = uDst.u;
8810}
8811
8812
8813IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8814{
8815 RT_NOREF(pFpuState);
8816 RTUINT128U uSrc1 = *puDst;
8817 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
8818 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
8819 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
8820 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
8821 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
8822 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
8823 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
8824 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
8825}
8826
8827#endif
8828
8829IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8830{
8831 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
8832 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
8833 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
8834 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
8835 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
8836 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
8837 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
8838 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
8839}
8840
8841
8842IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8843{
8844 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
8845 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
8846 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
8847 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
8848 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
8849 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
8850 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
8851 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
8852 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
8853 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
8854 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
8855 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
8856 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
8857 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
8858 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
8859 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
8860}
8861
8862
8863/*
8864 * PMULHUW / VPMULHUW
8865 */
8866#ifdef IEM_WITHOUT_ASSEMBLY
8867
8868IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8869{
8870 RTUINT64U uSrc1 = { *puDst };
8871 RTUINT64U uSrc2 = { *puSrc };
8872 RTUINT64U uDst;
8873 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
8874 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
8875 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
8876 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
8877 *puDst = uDst.u;
8878}
8879
8880
8881IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8882{
8883 RTUINT128U uSrc1 = *puDst;
8884 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
8885 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
8886 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
8887 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
8888 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
8889 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
8890 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
8891 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
8892}
8893
8894#endif
8895
8896IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8897{
8898 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
8899 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
8900 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
8901 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
8902 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
8903 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
8904 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
8905 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
8906}
8907
8908
8909IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8910{
8911 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
8912 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
8913 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
8914 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
8915 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
8916 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
8917 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
8918 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
8919 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
8920 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
8921 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
8922 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
8923 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
8924 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
8925 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
8926 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
8927}
8928
8929
8930/*
8931 * PSRLW / VPSRLW
8932 */
8933#ifdef IEM_WITHOUT_ASSEMBLY
8934
8935IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8936{
8937 RTUINT64U uSrc1 = { *puDst };
8938 RTUINT64U uSrc2 = { *puSrc };
8939 RTUINT64U uDst;
8940
8941 if (uSrc2.au64[0] <= 15)
8942 {
8943 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
8944 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
8945 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
8946 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
8947 }
8948 else
8949 {
8950 uDst.au64[0] = 0;
8951 }
8952 *puDst = uDst.u;
8953}
8954
8955
8956IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
8957{
8958 RTUINT64U uSrc1 = { *puDst };
8959 RTUINT64U uDst;
8960
8961 if (uShift <= 15)
8962 {
8963 uDst.au16[0] = uSrc1.au16[0] >> uShift;
8964 uDst.au16[1] = uSrc1.au16[1] >> uShift;
8965 uDst.au16[2] = uSrc1.au16[2] >> uShift;
8966 uDst.au16[3] = uSrc1.au16[3] >> uShift;
8967 }
8968 else
8969 {
8970 uDst.au64[0] = 0;
8971 }
8972 *puDst = uDst.u;
8973}
8974
8975
8976IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8977{
8978 RTUINT128U uSrc1 = *puDst;
8979
8980 if (puSrc->au64[0] <= 15)
8981 {
8982 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
8983 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
8984 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
8985 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
8986 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
8987 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
8988 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
8989 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
8990 }
8991 else
8992 {
8993 puDst->au64[0] = 0;
8994 puDst->au64[1] = 0;
8995 }
8996}
8997
8998IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
8999{
9000 RTUINT128U uSrc1 = *puDst;
9001
9002 if (uShift <= 15)
9003 {
9004 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9005 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9006 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9007 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9008 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9009 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9010 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9011 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9012 }
9013 else
9014 {
9015 puDst->au64[0] = 0;
9016 puDst->au64[1] = 0;
9017 }
9018}
9019
9020#endif
9021
9022
9023/*
9024 * PSRAW / VPSRAW
9025 */
9026#ifdef IEM_WITHOUT_ASSEMBLY
9027
9028IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9029{
9030 RTUINT64U uSrc1 = { *puDst };
9031 RTUINT64U uSrc2 = { *puSrc };
9032 RTUINT64U uDst;
9033
9034 if (uSrc2.au64[0] <= 15)
9035 {
9036 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
9037 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
9038 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
9039 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
9040 }
9041 else
9042 {
9043 uDst.au64[0] = 0;
9044 }
9045 *puDst = uDst.u;
9046}
9047
9048
9049IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9050{
9051 RTUINT64U uSrc1 = { *puDst };
9052 RTUINT64U uDst;
9053
9054 if (uShift <= 15)
9055 {
9056 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
9057 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
9058 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
9059 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
9060 }
9061 else
9062 {
9063 uDst.au64[0] = 0;
9064 }
9065 *puDst = uDst.u;
9066}
9067
9068
9069IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9070{
9071 RTUINT128U uSrc1 = *puDst;
9072
9073 if (puSrc->au64[0] <= 15)
9074 {
9075 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
9076 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
9077 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
9078 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
9079 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
9080 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
9081 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
9082 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
9083 }
9084 else
9085 {
9086 puDst->au64[0] = 0;
9087 puDst->au64[1] = 0;
9088 }
9089}
9090
9091IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9092{
9093 RTUINT128U uSrc1 = *puDst;
9094
9095 if (uShift <= 15)
9096 {
9097 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
9098 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
9099 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
9100 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
9101 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
9102 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
9103 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
9104 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
9105 }
9106 else
9107 {
9108 puDst->au64[0] = 0;
9109 puDst->au64[1] = 0;
9110 }
9111}
9112
9113#endif
9114
9115
9116/*
9117 * PSLLW / VPSLLW
9118 */
9119#ifdef IEM_WITHOUT_ASSEMBLY
9120
9121IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9122{
9123 RTUINT64U uSrc1 = { *puDst };
9124 RTUINT64U uSrc2 = { *puSrc };
9125 RTUINT64U uDst;
9126
9127 if (uSrc2.au64[0] <= 15)
9128 {
9129 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
9130 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
9131 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
9132 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
9133 }
9134 else
9135 {
9136 uDst.au64[0] = 0;
9137 }
9138 *puDst = uDst.u;
9139}
9140
9141
9142IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9143{
9144 RTUINT64U uSrc1 = { *puDst };
9145 RTUINT64U uDst;
9146
9147 if (uShift <= 15)
9148 {
9149 uDst.au16[0] = uSrc1.au16[0] << uShift;
9150 uDst.au16[1] = uSrc1.au16[1] << uShift;
9151 uDst.au16[2] = uSrc1.au16[2] << uShift;
9152 uDst.au16[3] = uSrc1.au16[3] << uShift;
9153 }
9154 else
9155 {
9156 uDst.au64[0] = 0;
9157 }
9158 *puDst = uDst.u;
9159}
9160
9161
9162IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9163{
9164 RTUINT128U uSrc1 = *puDst;
9165
9166 if (puSrc->au64[0] <= 15)
9167 {
9168 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
9169 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
9170 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
9171 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
9172 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
9173 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
9174 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
9175 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
9176 }
9177 else
9178 {
9179 puDst->au64[0] = 0;
9180 puDst->au64[1] = 0;
9181 }
9182}
9183
9184IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9185{
9186 RTUINT128U uSrc1 = *puDst;
9187
9188 if (uShift <= 15)
9189 {
9190 puDst->au16[0] = uSrc1.au16[0] << uShift;
9191 puDst->au16[1] = uSrc1.au16[1] << uShift;
9192 puDst->au16[2] = uSrc1.au16[2] << uShift;
9193 puDst->au16[3] = uSrc1.au16[3] << uShift;
9194 puDst->au16[4] = uSrc1.au16[4] << uShift;
9195 puDst->au16[5] = uSrc1.au16[5] << uShift;
9196 puDst->au16[6] = uSrc1.au16[6] << uShift;
9197 puDst->au16[7] = uSrc1.au16[7] << uShift;
9198 }
9199 else
9200 {
9201 puDst->au64[0] = 0;
9202 puDst->au64[1] = 0;
9203 }
9204}
9205
9206#endif
9207
9208
9209/*
9210 * PSRLD / VPSRLD
9211 */
9212#ifdef IEM_WITHOUT_ASSEMBLY
9213
9214IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9215{
9216 RTUINT64U uSrc1 = { *puDst };
9217 RTUINT64U uSrc2 = { *puSrc };
9218 RTUINT64U uDst;
9219
9220 if (uSrc2.au64[0] <= 31)
9221 {
9222 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
9223 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
9224 }
9225 else
9226 {
9227 uDst.au64[0] = 0;
9228 }
9229 *puDst = uDst.u;
9230}
9231
9232
9233IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9234{
9235 RTUINT64U uSrc1 = { *puDst };
9236 RTUINT64U uDst;
9237
9238 if (uShift <= 31)
9239 {
9240 uDst.au32[0] = uSrc1.au32[0] >> uShift;
9241 uDst.au32[1] = uSrc1.au32[1] >> uShift;
9242 }
9243 else
9244 {
9245 uDst.au64[0] = 0;
9246 }
9247 *puDst = uDst.u;
9248}
9249
9250
9251IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9252{
9253 RTUINT128U uSrc1 = *puDst;
9254
9255 if (puSrc->au64[0] <= 31)
9256 {
9257 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
9258 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
9259 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
9260 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
9261 }
9262 else
9263 {
9264 puDst->au64[0] = 0;
9265 puDst->au64[1] = 0;
9266 }
9267}
9268
9269IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9270{
9271 RTUINT128U uSrc1 = *puDst;
9272
9273 if (uShift <= 31)
9274 {
9275 puDst->au32[0] = uSrc1.au32[0] >> uShift;
9276 puDst->au32[1] = uSrc1.au32[1] >> uShift;
9277 puDst->au32[2] = uSrc1.au32[2] >> uShift;
9278 puDst->au32[3] = uSrc1.au32[3] >> uShift;
9279 }
9280 else
9281 {
9282 puDst->au64[0] = 0;
9283 puDst->au64[1] = 0;
9284 }
9285}
9286
9287#endif
9288
9289
9290/*
9291 * PSRAD / VPSRAD
9292 */
9293#ifdef IEM_WITHOUT_ASSEMBLY
9294
9295IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
9296{
9297 RTUINT64U uSrc1 = { *puDst };
9298 RTUINT64U uSrc2 = { *puSrc };
9299 RTUINT64U uDst;
9300
9301 if (uSrc2.au64[0] <= 31)
9302 {
9303 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
9304 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
9305 }
9306 else
9307 {
9308 uDst.au64[0] = 0;
9309 }
9310 *puDst = uDst.u;
9311}
9312
9313
9314IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
9315{
9316 RTUINT64U uSrc1 = { *puDst };
9317 RTUINT64U uDst;
9318
9319 if (uShift <= 31)
9320 {
9321 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
9322 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
9323 }
9324 else
9325 {
9326 uDst.au64[0] = 0;
9327 }
9328 *puDst = uDst.u;
9329}
9330
9331
9332IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9333{
9334 RTUINT128U uSrc1 = *puDst;
9335
9336 if (puSrc->au64[0] <= 31)
9337 {
9338 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
9339 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
9340 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
9341 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
9342 }
9343 else
9344 {
9345 puDst->au64[0] = 0;
9346 puDst->au64[1] = 0;
9347 }
9348}
9349
9350IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9351{
9352 RTUINT128U uSrc1 = *puDst;
9353
9354 if (uShift <= 31)
9355 {
9356 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
9357 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
9358 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
9359 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
9360 }
9361 else
9362 {
9363 puDst->au64[0] = 0;
9364 puDst->au64[1] = 0;
9365 }
9366}
9367
9368#endif
9369
9370
9371/*
9372 * PSLLD / VPSLLD
9373 */
9374#ifdef IEM_WITHOUT_ASSEMBLY
9375
9376IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9377{
9378 RTUINT64U uSrc1 = { *puDst };
9379 RTUINT64U uSrc2 = { *puSrc };
9380 RTUINT64U uDst;
9381
9382 if (uSrc2.au64[0] <= 31)
9383 {
9384 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
9385 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
9386 }
9387 else
9388 {
9389 uDst.au64[0] = 0;
9390 }
9391 *puDst = uDst.u;
9392}
9393
9394
9395IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9396{
9397 RTUINT64U uSrc1 = { *puDst };
9398 RTUINT64U uDst;
9399
9400 if (uShift <= 31)
9401 {
9402 uDst.au32[0] = uSrc1.au32[0] << uShift;
9403 uDst.au32[1] = uSrc1.au32[1] << uShift;
9404 }
9405 else
9406 {
9407 uDst.au64[0] = 0;
9408 }
9409 *puDst = uDst.u;
9410}
9411
9412
9413IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9414{
9415 RTUINT128U uSrc1 = *puDst;
9416
9417 if (puSrc->au64[0] <= 31)
9418 {
9419 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
9420 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
9421 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
9422 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
9423 }
9424 else
9425 {
9426 puDst->au64[0] = 0;
9427 puDst->au64[1] = 0;
9428 }
9429}
9430
9431IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9432{
9433 RTUINT128U uSrc1 = *puDst;
9434
9435 if (uShift <= 31)
9436 {
9437 puDst->au32[0] = uSrc1.au32[0] << uShift;
9438 puDst->au32[1] = uSrc1.au32[1] << uShift;
9439 puDst->au32[2] = uSrc1.au32[2] << uShift;
9440 puDst->au32[3] = uSrc1.au32[3] << uShift;
9441 }
9442 else
9443 {
9444 puDst->au64[0] = 0;
9445 puDst->au64[1] = 0;
9446 }
9447}
9448
9449#endif
9450
9451
9452/*
9453 * PSRLQ / VPSRLQ
9454 */
9455#ifdef IEM_WITHOUT_ASSEMBLY
9456
9457IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9458{
9459 RTUINT64U uSrc1 = { *puDst };
9460 RTUINT64U uSrc2 = { *puSrc };
9461 RTUINT64U uDst;
9462
9463 if (uSrc2.au64[0] <= 63)
9464 {
9465 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
9466 }
9467 else
9468 {
9469 uDst.au64[0] = 0;
9470 }
9471 *puDst = uDst.u;
9472}
9473
9474
9475IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
9476{
9477 RTUINT64U uSrc1 = { *puDst };
9478 RTUINT64U uDst;
9479
9480 if (uShift <= 63)
9481 {
9482 uDst.au64[0] = uSrc1.au64[0] >> uShift;
9483 }
9484 else
9485 {
9486 uDst.au64[0] = 0;
9487 }
9488 *puDst = uDst.u;
9489}
9490
9491
9492IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9493{
9494 RTUINT128U uSrc1 = *puDst;
9495
9496 if (puSrc->au64[0] <= 63)
9497 {
9498 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
9499 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
9500 }
9501 else
9502 {
9503 puDst->au64[0] = 0;
9504 puDst->au64[1] = 0;
9505 }
9506}
9507
9508IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9509{
9510 RTUINT128U uSrc1 = *puDst;
9511
9512 if (uShift <= 63)
9513 {
9514 puDst->au64[0] = uSrc1.au64[0] >> uShift;
9515 puDst->au64[1] = uSrc1.au64[1] >> uShift;
9516 }
9517 else
9518 {
9519 puDst->au64[0] = 0;
9520 puDst->au64[1] = 0;
9521 }
9522}
9523
9524#endif
9525
9526
9527/*
9528 * PSLLQ / VPSLLQ
9529 */
9530#ifdef IEM_WITHOUT_ASSEMBLY
9531
9532IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9533{
9534 RTUINT64U uSrc1 = { *puDst };
9535 RTUINT64U uSrc2 = { *puSrc };
9536 RTUINT64U uDst;
9537
9538 if (uSrc2.au64[0] <= 63)
9539 {
9540 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
9541 }
9542 else
9543 {
9544 uDst.au64[0] = 0;
9545 }
9546 *puDst = uDst.u;
9547}
9548
9549
9550IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
9551{
9552 RTUINT64U uSrc1 = { *puDst };
9553 RTUINT64U uDst;
9554
9555 if (uShift <= 63)
9556 {
9557 uDst.au64[0] = uSrc1.au64[0] << uShift;
9558 }
9559 else
9560 {
9561 uDst.au64[0] = 0;
9562 }
9563 *puDst = uDst.u;
9564}
9565
9566
9567IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9568{
9569 RTUINT128U uSrc1 = *puDst;
9570
9571 if (puSrc->au64[0] <= 63)
9572 {
9573 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
9574 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
9575 }
9576 else
9577 {
9578 puDst->au64[0] = 0;
9579 puDst->au64[1] = 0;
9580 }
9581}
9582
9583IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9584{
9585 RTUINT128U uSrc1 = *puDst;
9586
9587 if (uShift <= 63)
9588 {
9589 puDst->au64[0] = uSrc1.au64[0] << uShift;
9590 puDst->au64[1] = uSrc1.au64[1] << uShift;
9591 }
9592 else
9593 {
9594 puDst->au64[0] = 0;
9595 puDst->au64[1] = 0;
9596 }
9597}
9598
9599#endif
9600
9601
9602/*
9603 * PSRLDQ / VPSRLDQ
9604 */
9605#ifdef IEM_WITHOUT_ASSEMBLY
9606
9607IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9608{
9609 RTUINT128U uSrc1 = *puDst;
9610
9611 if (uShift < 16)
9612 {
9613 int i;
9614
9615 for (i = 0; i < 16 - uShift; ++i)
9616 puDst->au8[i] = uSrc1.au8[i + uShift];
9617 for (i = 16 - uShift; i < 16; ++i)
9618 puDst->au8[i] = 0;
9619 }
9620 else
9621 {
9622 puDst->au64[0] = 0;
9623 puDst->au64[1] = 0;
9624 }
9625}
9626
9627#endif
9628
9629
9630/*
9631 * PSLLDQ / VPSLLDQ
9632 */
9633#ifdef IEM_WITHOUT_ASSEMBLY
9634
9635IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9636{
9637 RTUINT128U uSrc1 = *puDst;
9638
9639 if (uShift < 16)
9640 {
9641 int i;
9642
9643 for (i = 0; i < uShift; ++i)
9644 puDst->au8[i] = 0;
9645 for (i = uShift; i < 16; ++i)
9646 puDst->au8[i] = uSrc1.au8[i - uShift];
9647 }
9648 else
9649 {
9650 puDst->au64[0] = 0;
9651 puDst->au64[1] = 0;
9652 }
9653}
9654
9655#endif
9656
9657
9658/*
9659 * PMADDWD / VPMADDWD
9660 */
9661#ifdef IEM_WITHOUT_ASSEMBLY
9662
9663IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9664{
9665 RTUINT64U uSrc1 = { *puDst };
9666 RTUINT64U uSrc2 = { *puSrc };
9667 RTUINT64U uDst;
9668
9669 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
9670 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
9671 *puDst = uDst.u;
9672 RT_NOREF(pFpuState);
9673}
9674
9675
9676IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9677{
9678 RTUINT128U uSrc1 = *puDst;
9679
9680 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
9681 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
9682 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
9683 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
9684 RT_NOREF(pFpuState);
9685}
9686
9687#endif
9688
9689
9690/*
9691 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
9692 */
9693#ifdef IEM_WITHOUT_ASSEMBLY
9694
9695IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9696{
9697 RTUINT64U uSrc1 = { *puDst };
9698 RTUINT64U uSrc2 = { *puSrc };
9699 RTUINT64U uDst;
9700
9701 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
9702 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
9703 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
9704 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
9705 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
9706 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
9707 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
9708 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
9709 *puDst = uDst.u;
9710 RT_NOREF(pFpuState);
9711}
9712
9713
9714IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9715{
9716 RTUINT128U uSrc1 = *puDst;
9717
9718 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
9719 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
9720 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
9721 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
9722 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
9723 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
9724 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
9725 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
9726 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
9727 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
9728 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
9729 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
9730 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
9731 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
9732 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
9733 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
9734 RT_NOREF(pFpuState);
9735}
9736
9737#endif
9738
9739
9740IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9741{
9742 RTUINT128U uSrc1 = *puDst;
9743
9744 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
9745 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
9746 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
9747 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
9748 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
9749 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
9750 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
9751 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
9752 RT_NOREF(pFpuState);
9753}
9754
9755
9756IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9757{
9758 RTUINT128U uSrc1 = *puDst;
9759
9760 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
9761 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
9762 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
9763 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
9764 RT_NOREF(pFpuState);
9765}
9766
9767
9768IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9769 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9770{
9771 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
9772 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
9773 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
9774 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
9775 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
9776 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
9777 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
9778 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
9779 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
9780 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
9781 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
9782 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
9783 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
9784 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
9785 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
9786 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
9787 RT_NOREF(pExtState);
9788}
9789
9790
9791IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9792 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9793{
9794 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
9795 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
9796 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
9797 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
9798 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
9799 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
9800 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
9801 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
9802 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
9803 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
9804 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
9805 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
9806 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
9807 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
9808 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
9809 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
9810 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
9811 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
9812 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
9813 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
9814 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
9815 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
9816 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
9817 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
9818 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
9819 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
9820 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
9821 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
9822 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
9823 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
9824 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
9825 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
9826 RT_NOREF(pExtState);
9827}
9828
9829
9830IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9831 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9832{
9833 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
9834 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
9835 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
9836 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
9837 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
9838 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
9839 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
9840 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
9841 RT_NOREF(pExtState);
9842}
9843
9844
9845IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9846 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9847{
9848 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
9849 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
9850 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
9851 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
9852 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
9853 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
9854 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
9855 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
9856 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
9857 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
9858 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
9859 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
9860 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
9861 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
9862 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
9863 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
9864 RT_NOREF(pExtState);
9865}
9866
9867
9868IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9869 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9870{
9871 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
9872 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
9873 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
9874 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
9875 RT_NOREF(pExtState);
9876}
9877
9878
9879IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9880 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9881{
9882 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
9883 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
9884 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
9885 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
9886 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
9887 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
9888 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
9889 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
9890 RT_NOREF(pExtState);
9891}
9892
9893
9894/*
9895 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
9896 */
9897#ifdef IEM_WITHOUT_ASSEMBLY
9898
9899IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9900{
9901 RTUINT64U uSrc1 = { *puDst };
9902 RTUINT64U uSrc2 = { *puSrc };
9903 RTUINT64U uDst;
9904
9905 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
9906 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
9907 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
9908 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
9909 *puDst = uDst.u;
9910 RT_NOREF(pFpuState);
9911}
9912
9913
9914IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9915{
9916 RTUINT128U uSrc1 = *puDst;
9917
9918 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
9919 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
9920 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
9921 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
9922 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
9923 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
9924 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
9925 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
9926 RT_NOREF(pFpuState);
9927}
9928
9929#endif
9930
9931IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9932{
9933 RTUINT128U uSrc1 = *puDst;
9934
9935 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
9936 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
9937 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
9938 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
9939 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
9940 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
9941 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
9942 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
9943 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
9944 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
9945 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
9946 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
9947 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
9948 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
9949 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
9950 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
9951 RT_NOREF(pFpuState);
9952}
9953
9954
9955IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9956{
9957 RTUINT128U uSrc1 = *puDst;
9958
9959 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
9960 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
9961 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
9962 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
9963 RT_NOREF(pFpuState);
9964}
9965
9966
9967IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9968 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9969{
9970 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
9971 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
9972 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
9973 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
9974 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
9975 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
9976 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
9977 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
9978 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
9979 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
9980 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
9981 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
9982 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
9983 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
9984 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
9985 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
9986 RT_NOREF(pExtState);
9987}
9988
9989
9990IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9991 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9992{
9993 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
9994 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
9995 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
9996 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
9997 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
9998 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
9999 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10000 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10001 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10002 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10003 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10004 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10005 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10006 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10007 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10008 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10009 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
10010 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
10011 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
10012 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
10013 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
10014 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
10015 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
10016 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
10017 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
10018 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
10019 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
10020 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
10021 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
10022 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
10023 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
10024 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
10025 RT_NOREF(pExtState);
10026}
10027
10028
10029IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10030 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10031{
10032 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10033 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10034 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10035 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10036 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10037 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10038 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10039 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10040 RT_NOREF(pExtState);
10041}
10042
10043
10044IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10045 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10046{
10047 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10048 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10049 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10050 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10051 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10052 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10053 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10054 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10055 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10056 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10057 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
10058 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
10059 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
10060 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
10061 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
10062 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
10063 RT_NOREF(pExtState);
10064}
10065
10066
10067IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10068 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10069{
10070 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10071 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10072 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10073 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10074 RT_NOREF(pExtState);
10075}
10076
10077
10078IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10079 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10080{
10081 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10082 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10083 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10084 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10085 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10086 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10087 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10088 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10089 RT_NOREF(pExtState);
10090}
10091
10092
10093/*
10094 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
10095 */
10096#ifdef IEM_WITHOUT_ASSEMBLY
10097
10098IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10099{
10100 RTUINT64U uSrc1 = { *puDst };
10101 RTUINT64U uSrc2 = { *puSrc };
10102 RTUINT64U uDst;
10103
10104 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
10105 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
10106 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
10107 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
10108 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
10109 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
10110 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
10111 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
10112 *puDst = uDst.u;
10113 RT_NOREF(pFpuState);
10114}
10115
10116
10117IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10118{
10119 RTUINT128U uSrc1 = *puDst;
10120
10121 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
10122 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
10123 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
10124 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
10125 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
10126 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
10127 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
10128 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
10129 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
10130 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
10131 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
10132 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
10133 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
10134 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
10135 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
10136 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
10137 RT_NOREF(pFpuState);
10138}
10139
10140#endif
10141
10142IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10143{
10144 RTUINT128U uSrc1 = *puDst;
10145
10146 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
10147 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
10148 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
10149 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
10150 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
10151 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
10152 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
10153 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
10154 RT_NOREF(pFpuState);
10155}
10156
10157
10158IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10159{
10160 RTUINT128U uSrc1 = *puDst;
10161
10162 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
10163 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
10164 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
10165 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
10166 RT_NOREF(pFpuState);
10167}
10168
10169
10170IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10171 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10172{
10173 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10174 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10175 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10176 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10177 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10178 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10179 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10180 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10181 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10182 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10183 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10184 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10185 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10186 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10187 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10188 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10189 RT_NOREF(pExtState);
10190}
10191
10192
10193IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10194 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10195{
10196 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10197 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10198 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10199 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10200 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10201 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10202 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10203 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10204 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10205 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10206 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10207 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10208 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10209 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10210 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10211 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10212 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
10213 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
10214 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
10215 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
10216 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
10217 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
10218 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
10219 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
10220 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
10221 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
10222 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
10223 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
10224 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
10225 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
10226 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
10227 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
10228 RT_NOREF(pExtState);
10229}
10230
10231
10232IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10233 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10234{
10235 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10236 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10237 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10238 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10239 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10240 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10241 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10242 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10243 RT_NOREF(pExtState);
10244}
10245
10246
10247IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10248 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10249{
10250 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10251 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10252 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10253 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10254 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10255 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10256 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10257 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10258 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10259 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10260 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
10261 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
10262 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
10263 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
10264 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
10265 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
10266 RT_NOREF(pExtState);
10267}
10268
10269
10270IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10271 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10272{
10273 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10274 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10275 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10276 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10277 RT_NOREF(pExtState);
10278}
10279
10280
10281IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10282 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10283{
10284 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10285 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10286 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10287 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10288 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10289 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10290 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10291 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10292 RT_NOREF(pExtState);
10293}
10294
10295
10296/*
10297 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
10298 */
10299#ifdef IEM_WITHOUT_ASSEMBLY
10300
10301IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10302{
10303 RTUINT64U uSrc1 = { *puDst };
10304 RTUINT64U uSrc2 = { *puSrc };
10305 RTUINT64U uDst;
10306
10307 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
10308 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
10309 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
10310 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
10311 *puDst = uDst.u;
10312 RT_NOREF(pFpuState);
10313}
10314
10315
10316IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10317{
10318 RTUINT128U uSrc1 = *puDst;
10319
10320 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10321 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10322 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10323 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10324 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10325 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10326 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10327 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10328 RT_NOREF(pFpuState);
10329}
10330
10331#endif
10332
10333IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10334{
10335 RTUINT128U uSrc1 = *puDst;
10336
10337 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10338 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10339 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10340 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10341 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10342 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10343 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10344 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10345 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10346 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10347 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
10348 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
10349 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
10350 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
10351 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
10352 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
10353 RT_NOREF(pFpuState);
10354}
10355
10356
10357IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10358{
10359 RTUINT128U uSrc1 = *puDst;
10360
10361 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10362 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10363 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10364 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10365 RT_NOREF(pFpuState);
10366}
10367
10368
10369IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10370 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10371{
10372 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10373 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10374 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10375 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10376 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10377 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10378 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10379 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10380 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10381 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10382 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10383 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10384 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10385 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10386 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10387 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10388 RT_NOREF(pExtState);
10389}
10390
10391
10392IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10393 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10394{
10395 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10396 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10397 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10398 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10399 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10400 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10401 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10402 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10403 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10404 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10405 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10406 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10407 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10408 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10409 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10410 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10411 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
10412 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
10413 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
10414 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
10415 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
10416 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
10417 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
10418 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
10419 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
10420 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
10421 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
10422 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
10423 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
10424 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
10425 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
10426 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
10427 RT_NOREF(pExtState);
10428}
10429
10430
10431IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10432 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10433{
10434 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10435 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10436 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10437 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10438 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10439 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10440 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10441 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10442 RT_NOREF(pExtState);
10443}
10444
10445
10446IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10447 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10448{
10449 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10450 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10451 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10452 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10453 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10454 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10455 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10456 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10457 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10458 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10459 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
10460 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
10461 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
10462 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
10463 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
10464 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
10465 RT_NOREF(pExtState);
10466}
10467
10468
10469IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10470 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10471{
10472 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10473 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10474 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10475 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10476 RT_NOREF(pExtState);
10477}
10478
10479
10480IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10481 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10482{
10483 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10484 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10485 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10486 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10487 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10488 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10489 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10490 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10491 RT_NOREF(pExtState);
10492}
10493
10494
10495/*
10496 * PAVGB / VPAVGB / PAVGW / VPAVGW
10497 */
/** Rounded average of two byte lanes: (left + right + 1) >> 1, computed in a
 *  wider type so the intermediate sum cannot wrap, then truncated to 8 bits. */
#define PAVGB_EXEC(a_uLeft, a_uRight)   ((uint8_t)(((uint16_t)(a_uLeft) + (a_uRight) + 1) >> 1))
/** Rounded average of two word lanes: (left + right + 1) >> 1, computed in a
 *  wider type so the intermediate sum cannot wrap, then truncated to 16 bits. */
#define PAVGW_EXEC(a_uLeft, a_uRight)   ((uint16_t)(((uint32_t)(a_uLeft) + (a_uRight) + 1) >> 1))
10500
10501#ifdef IEM_WITHOUT_ASSEMBLY
10502
10503IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
10504{
10505 RTUINT64U uSrc1 = { *puDst };
10506 RTUINT64U uSrc2 = { *puSrc };
10507 RTUINT64U uDst;
10508
10509 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
10510 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
10511 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
10512 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
10513 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
10514 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
10515 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
10516 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
10517 *puDst = uDst.u;
10518}
10519
10520
10521IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10522{
10523 RTUINT128U uSrc1 = *puDst;
10524
10525 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
10526 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
10527 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
10528 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
10529 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
10530 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
10531 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
10532 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
10533 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
10534 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
10535 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
10536 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
10537 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
10538 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
10539 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
10540 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
10541}
10542
10543
10544IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10545{
10546 RTUINT64U uSrc1 = { *puDst };
10547 RTUINT64U uSrc2 = { *puSrc };
10548 RTUINT64U uDst;
10549
10550 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
10551 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
10552 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
10553 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
10554 *puDst = uDst.u;
10555}
10556
10557
10558IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10559{
10560 RTUINT128U uSrc1 = *puDst;
10561
10562 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
10563 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
10564 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
10565 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
10566 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
10567 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
10568 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
10569 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
10570}
10571
10572#endif
10573
10574IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10575{
10576 RTUINT128U uSrc1 = *puDst;
10577
10578 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
10579 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
10580 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
10581 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
10582 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
10583 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
10584 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
10585 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
10586 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
10587 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
10588 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
10589 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
10590 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
10591 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
10592 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
10593 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
10594}
10595
10596
10597IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10598{
10599 RTUINT128U uSrc1 = *puDst;
10600
10601 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
10602 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
10603 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
10604 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
10605 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
10606 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
10607 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
10608 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
10609 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
10610 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
10611 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
10612 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
10613 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
10614 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
10615 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
10616 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
10617}
10618
10619
10620IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10621{
10622 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10623 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10624 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10625 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10626 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10627 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10628 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10629 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10630 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10631 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10632 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
10633 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
10634 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
10635 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
10636 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
10637 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
10638}
10639
10640
10641IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10642{
10643 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10644 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10645 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10646 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10647 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10648 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10649 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10650 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10651 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10652 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10653 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
10654 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
10655 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
10656 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
10657 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
10658 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
10659 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
10660 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
10661 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
10662 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
10663 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
10664 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
10665 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
10666 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
10667 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
10668 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
10669 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
10670 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
10671 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
10672 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
10673 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
10674 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
10675}
10676
10677
10678IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10679{
10680 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10681 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10682 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10683 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10684 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10685 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10686 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10687 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10688}
10689
10690
10691IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10692{
10693 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10694 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10695 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10696 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10697 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10698 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10699 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10700 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10701 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10702 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10703 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
10704 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
10705 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
10706 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
10707 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
10708 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
10709}
10710
10711#undef PAVGB_EXEC
10712#undef PAVGW_EXEC
10713
10714
10715/*
10716 * PMOVMSKB / VPMOVMSKB
10717 */
10718#ifdef IEM_WITHOUT_ASSEMBLY
10719
10720IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
10721{
10722 /* The the most signficant bit from each byte and store them in the given general purpose register. */
10723 uint64_t const uSrc = *pu64Src;
10724 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
10725 | ((uSrc >> (15-1)) & RT_BIT_64(1))
10726 | ((uSrc >> (23-2)) & RT_BIT_64(2))
10727 | ((uSrc >> (31-3)) & RT_BIT_64(3))
10728 | ((uSrc >> (39-4)) & RT_BIT_64(4))
10729 | ((uSrc >> (47-5)) & RT_BIT_64(5))
10730 | ((uSrc >> (55-6)) & RT_BIT_64(6))
10731 | ((uSrc >> (63-7)) & RT_BIT_64(7));
10732}
10733
10734
10735IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
10736{
10737 /* The the most signficant bit from each byte and store them in the given general purpose register. */
10738 uint64_t const uSrc0 = pu128Src->QWords.qw0;
10739 uint64_t const uSrc1 = pu128Src->QWords.qw1;
10740 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
10741 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
10742 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
10743 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
10744 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
10745 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
10746 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
10747 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
10748 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
10749 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
10750 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
10751 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
10752 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
10753 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
10754 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
10755 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
10756}
10757
10758#endif
10759
10760IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
10761{
10762 /* The the most signficant bit from each byte and store them in the given general purpose register. */
10763 uint64_t const uSrc0 = puSrc->QWords.qw0;
10764 uint64_t const uSrc1 = puSrc->QWords.qw1;
10765 uint64_t const uSrc2 = puSrc->QWords.qw2;
10766 uint64_t const uSrc3 = puSrc->QWords.qw3;
10767 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
10768 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
10769 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
10770 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
10771 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
10772 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
10773 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
10774 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
10775 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
10776 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
10777 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
10778 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
10779 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
10780 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
10781 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
10782 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
10783 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
10784 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
10785 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
10786 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
10787 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
10788 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
10789 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
10790 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
10791 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
10792 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
10793 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
10794 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
10795 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
10796 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
10797 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
10798 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
10799}
10800
10801
10802/*
10803 * [V]PSHUFB
10804 */
10805
10806IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10807{
10808 RTUINT64U const uSrc = { *puSrc };
10809 RTUINT64U const uDstIn = { *puDst };
10810 ASMCompilerBarrier();
10811 RTUINT64U uDstOut = { 0 };
10812 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
10813 {
10814 uint8_t idxSrc = uSrc.au8[iByte];
10815 if (!(idxSrc & 0x80))
10816 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
10817 }
10818 *puDst = uDstOut.u;
10819 RT_NOREF(pFpuState);
10820}
10821
10822
10823IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10824{
10825 RTUINT128U const uSrc = *puSrc;
10826 RTUINT128U const uDstIn = *puDst;
10827 ASMCompilerBarrier();
10828 puDst->au64[0] = 0;
10829 puDst->au64[1] = 0;
10830 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
10831 {
10832 uint8_t idxSrc = uSrc.au8[iByte];
10833 if (!(idxSrc & 0x80))
10834 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
10835 }
10836 RT_NOREF(pFpuState);
10837}
10838
10839
10840IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10841 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10842{
10843 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
10844 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
10845 ASMCompilerBarrier();
10846 puDst->au64[0] = 0;
10847 puDst->au64[1] = 0;
10848 for (unsigned iByte = 0; iByte < 16; iByte++)
10849 {
10850 uint8_t idxSrc = uSrc2.au8[iByte];
10851 if (!(idxSrc & 0x80))
10852 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
10853 }
10854 RT_NOREF(pExtState);
10855}
10856
10857
10858IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10859 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10860{
10861 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
10862 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
10863 ASMCompilerBarrier();
10864 puDst->au64[0] = 0;
10865 puDst->au64[1] = 0;
10866 puDst->au64[2] = 0;
10867 puDst->au64[3] = 0;
10868 for (unsigned iByte = 0; iByte < 16; iByte++)
10869 {
10870 uint8_t idxSrc = uSrc2.au8[iByte];
10871 if (!(idxSrc & 0x80))
10872 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
10873 }
10874 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
10875 {
10876 uint8_t idxSrc = uSrc2.au8[iByte];
10877 if (!(idxSrc & 0x80))
10878 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
10879 }
10880 RT_NOREF(pExtState);
10881}
10882
10883
10884/*
10885 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
10886 */
10887#ifdef IEM_WITHOUT_ASSEMBLY
10888
10889IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
10890{
10891 uint64_t const uSrc = *puSrc;
10892 ASMCompilerBarrier();
10893 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
10894 uSrc >> (((bEvil >> 2) & 3) * 16),
10895 uSrc >> (((bEvil >> 4) & 3) * 16),
10896 uSrc >> (((bEvil >> 6) & 3) * 16));
10897}
10898
10899
10900IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
10901{
10902 puDst->QWords.qw0 = puSrc->QWords.qw0;
10903 uint64_t const uSrc = puSrc->QWords.qw1;
10904 ASMCompilerBarrier();
10905 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
10906 uSrc >> (((bEvil >> 2) & 3) * 16),
10907 uSrc >> (((bEvil >> 4) & 3) * 16),
10908 uSrc >> (((bEvil >> 6) & 3) * 16));
10909}
10910
10911#endif
10912
10913IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
10914{
10915 puDst->QWords.qw0 = puSrc->QWords.qw0;
10916 uint64_t const uSrc1 = puSrc->QWords.qw1;
10917 puDst->QWords.qw2 = puSrc->QWords.qw2;
10918 uint64_t const uSrc3 = puSrc->QWords.qw3;
10919 ASMCompilerBarrier();
10920 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
10921 uSrc1 >> (((bEvil >> 2) & 3) * 16),
10922 uSrc1 >> (((bEvil >> 4) & 3) * 16),
10923 uSrc1 >> (((bEvil >> 6) & 3) * 16));
10924 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
10925 uSrc3 >> (((bEvil >> 2) & 3) * 16),
10926 uSrc3 >> (((bEvil >> 4) & 3) * 16),
10927 uSrc3 >> (((bEvil >> 6) & 3) * 16));
10928}
10929
10930#ifdef IEM_WITHOUT_ASSEMBLY
10931IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
10932{
10933 puDst->QWords.qw1 = puSrc->QWords.qw1;
10934 uint64_t const uSrc = puSrc->QWords.qw0;
10935 ASMCompilerBarrier();
10936 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
10937 uSrc >> (((bEvil >> 2) & 3) * 16),
10938 uSrc >> (((bEvil >> 4) & 3) * 16),
10939 uSrc >> (((bEvil >> 6) & 3) * 16));
10940
10941}
10942#endif
10943
10944
10945IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
10946{
10947 puDst->QWords.qw3 = puSrc->QWords.qw3;
10948 uint64_t const uSrc2 = puSrc->QWords.qw2;
10949 puDst->QWords.qw1 = puSrc->QWords.qw1;
10950 uint64_t const uSrc0 = puSrc->QWords.qw0;
10951 ASMCompilerBarrier();
10952 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
10953 uSrc0 >> (((bEvil >> 2) & 3) * 16),
10954 uSrc0 >> (((bEvil >> 4) & 3) * 16),
10955 uSrc0 >> (((bEvil >> 6) & 3) * 16));
10956 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
10957 uSrc2 >> (((bEvil >> 2) & 3) * 16),
10958 uSrc2 >> (((bEvil >> 4) & 3) * 16),
10959 uSrc2 >> (((bEvil >> 6) & 3) * 16));
10960
10961}
10962
10963
10964#ifdef IEM_WITHOUT_ASSEMBLY
10965IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
10966{
10967 RTUINT128U const uSrc = *puSrc;
10968 ASMCompilerBarrier();
10969 puDst->au32[0] = uSrc.au32[bEvil & 3];
10970 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
10971 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
10972 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
10973}
10974#endif
10975
10976
10977IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
10978{
10979 RTUINT256U const uSrc = *puSrc;
10980 ASMCompilerBarrier();
10981 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
10982 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
10983 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
10984 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
10985 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
10986 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
10987 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
10988 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
10989}
10990
10991
10992/*
10993 * PUNPCKHBW - high bytes -> words
10994 */
10995#ifdef IEM_WITHOUT_ASSEMBLY
10996
10997IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10998{
10999 RTUINT64U const uSrc2 = { *puSrc };
11000 RTUINT64U const uSrc1 = { *puDst };
11001 ASMCompilerBarrier();
11002 RTUINT64U uDstOut;
11003 uDstOut.au8[0] = uSrc1.au8[4];
11004 uDstOut.au8[1] = uSrc2.au8[4];
11005 uDstOut.au8[2] = uSrc1.au8[5];
11006 uDstOut.au8[3] = uSrc2.au8[5];
11007 uDstOut.au8[4] = uSrc1.au8[6];
11008 uDstOut.au8[5] = uSrc2.au8[6];
11009 uDstOut.au8[6] = uSrc1.au8[7];
11010 uDstOut.au8[7] = uSrc2.au8[7];
11011 *puDst = uDstOut.u;
11012}
11013
11014
11015IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11016{
11017 RTUINT128U const uSrc2 = *puSrc;
11018 RTUINT128U const uSrc1 = *puDst;
11019 ASMCompilerBarrier();
11020 RTUINT128U uDstOut;
11021 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11022 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11023 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11024 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11025 uDstOut.au8[ 4] = uSrc1.au8[10];
11026 uDstOut.au8[ 5] = uSrc2.au8[10];
11027 uDstOut.au8[ 6] = uSrc1.au8[11];
11028 uDstOut.au8[ 7] = uSrc2.au8[11];
11029 uDstOut.au8[ 8] = uSrc1.au8[12];
11030 uDstOut.au8[ 9] = uSrc2.au8[12];
11031 uDstOut.au8[10] = uSrc1.au8[13];
11032 uDstOut.au8[11] = uSrc2.au8[13];
11033 uDstOut.au8[12] = uSrc1.au8[14];
11034 uDstOut.au8[13] = uSrc2.au8[14];
11035 uDstOut.au8[14] = uSrc1.au8[15];
11036 uDstOut.au8[15] = uSrc2.au8[15];
11037 *puDst = uDstOut;
11038}
11039
11040#endif
11041
11042IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11043{
11044 RTUINT128U const uSrc2 = *puSrc2;
11045 RTUINT128U const uSrc1 = *puSrc1;
11046 ASMCompilerBarrier();
11047 RTUINT128U uDstOut;
11048 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11049 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11050 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11051 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11052 uDstOut.au8[ 4] = uSrc1.au8[10];
11053 uDstOut.au8[ 5] = uSrc2.au8[10];
11054 uDstOut.au8[ 6] = uSrc1.au8[11];
11055 uDstOut.au8[ 7] = uSrc2.au8[11];
11056 uDstOut.au8[ 8] = uSrc1.au8[12];
11057 uDstOut.au8[ 9] = uSrc2.au8[12];
11058 uDstOut.au8[10] = uSrc1.au8[13];
11059 uDstOut.au8[11] = uSrc2.au8[13];
11060 uDstOut.au8[12] = uSrc1.au8[14];
11061 uDstOut.au8[13] = uSrc2.au8[14];
11062 uDstOut.au8[14] = uSrc1.au8[15];
11063 uDstOut.au8[15] = uSrc2.au8[15];
11064 *puDst = uDstOut;
11065}
11066
11067
11068IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11069{
11070 RTUINT256U const uSrc2 = *puSrc2;
11071 RTUINT256U const uSrc1 = *puSrc1;
11072 ASMCompilerBarrier();
11073 RTUINT256U uDstOut;
11074 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11075 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11076 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11077 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11078 uDstOut.au8[ 4] = uSrc1.au8[10];
11079 uDstOut.au8[ 5] = uSrc2.au8[10];
11080 uDstOut.au8[ 6] = uSrc1.au8[11];
11081 uDstOut.au8[ 7] = uSrc2.au8[11];
11082 uDstOut.au8[ 8] = uSrc1.au8[12];
11083 uDstOut.au8[ 9] = uSrc2.au8[12];
11084 uDstOut.au8[10] = uSrc1.au8[13];
11085 uDstOut.au8[11] = uSrc2.au8[13];
11086 uDstOut.au8[12] = uSrc1.au8[14];
11087 uDstOut.au8[13] = uSrc2.au8[14];
11088 uDstOut.au8[14] = uSrc1.au8[15];
11089 uDstOut.au8[15] = uSrc2.au8[15];
11090 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11091 uDstOut.au8[16] = uSrc1.au8[24];
11092 uDstOut.au8[17] = uSrc2.au8[24];
11093 uDstOut.au8[18] = uSrc1.au8[25];
11094 uDstOut.au8[19] = uSrc2.au8[25];
11095 uDstOut.au8[20] = uSrc1.au8[26];
11096 uDstOut.au8[21] = uSrc2.au8[26];
11097 uDstOut.au8[22] = uSrc1.au8[27];
11098 uDstOut.au8[23] = uSrc2.au8[27];
11099 uDstOut.au8[24] = uSrc1.au8[28];
11100 uDstOut.au8[25] = uSrc2.au8[28];
11101 uDstOut.au8[26] = uSrc1.au8[29];
11102 uDstOut.au8[27] = uSrc2.au8[29];
11103 uDstOut.au8[28] = uSrc1.au8[30];
11104 uDstOut.au8[29] = uSrc2.au8[30];
11105 uDstOut.au8[30] = uSrc1.au8[31];
11106 uDstOut.au8[31] = uSrc2.au8[31];
11107 *puDst = uDstOut;
11108}
11109
11110
/*
 * PUNPCKHWD - high words -> dwords
 */
11114#ifdef IEM_WITHOUT_ASSEMBLY
11115
11116IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11117{
11118 RTUINT64U const uSrc2 = { *puSrc };
11119 RTUINT64U const uSrc1 = { *puDst };
11120 ASMCompilerBarrier();
11121 RTUINT64U uDstOut;
11122 uDstOut.au16[0] = uSrc1.au16[2];
11123 uDstOut.au16[1] = uSrc2.au16[2];
11124 uDstOut.au16[2] = uSrc1.au16[3];
11125 uDstOut.au16[3] = uSrc2.au16[3];
11126 *puDst = uDstOut.u;
11127}
11128
11129
11130IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11131{
11132 RTUINT128U const uSrc2 = *puSrc;
11133 RTUINT128U const uSrc1 = *puDst;
11134 ASMCompilerBarrier();
11135 RTUINT128U uDstOut;
11136 uDstOut.au16[0] = uSrc1.au16[4];
11137 uDstOut.au16[1] = uSrc2.au16[4];
11138 uDstOut.au16[2] = uSrc1.au16[5];
11139 uDstOut.au16[3] = uSrc2.au16[5];
11140 uDstOut.au16[4] = uSrc1.au16[6];
11141 uDstOut.au16[5] = uSrc2.au16[6];
11142 uDstOut.au16[6] = uSrc1.au16[7];
11143 uDstOut.au16[7] = uSrc2.au16[7];
11144 *puDst = uDstOut;
11145}
11146
11147#endif
11148
11149IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11150{
11151 RTUINT128U const uSrc2 = *puSrc2;
11152 RTUINT128U const uSrc1 = *puSrc1;
11153 ASMCompilerBarrier();
11154 RTUINT128U uDstOut;
11155 uDstOut.au16[0] = uSrc1.au16[4];
11156 uDstOut.au16[1] = uSrc2.au16[4];
11157 uDstOut.au16[2] = uSrc1.au16[5];
11158 uDstOut.au16[3] = uSrc2.au16[5];
11159 uDstOut.au16[4] = uSrc1.au16[6];
11160 uDstOut.au16[5] = uSrc2.au16[6];
11161 uDstOut.au16[6] = uSrc1.au16[7];
11162 uDstOut.au16[7] = uSrc2.au16[7];
11163 *puDst = uDstOut;
11164}
11165
11166
11167IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11168{
11169 RTUINT256U const uSrc2 = *puSrc2;
11170 RTUINT256U const uSrc1 = *puSrc1;
11171 ASMCompilerBarrier();
11172 RTUINT256U uDstOut;
11173 uDstOut.au16[0] = uSrc1.au16[4];
11174 uDstOut.au16[1] = uSrc2.au16[4];
11175 uDstOut.au16[2] = uSrc1.au16[5];
11176 uDstOut.au16[3] = uSrc2.au16[5];
11177 uDstOut.au16[4] = uSrc1.au16[6];
11178 uDstOut.au16[5] = uSrc2.au16[6];
11179 uDstOut.au16[6] = uSrc1.au16[7];
11180 uDstOut.au16[7] = uSrc2.au16[7];
11181
11182 uDstOut.au16[8] = uSrc1.au16[12];
11183 uDstOut.au16[9] = uSrc2.au16[12];
11184 uDstOut.au16[10] = uSrc1.au16[13];
11185 uDstOut.au16[11] = uSrc2.au16[13];
11186 uDstOut.au16[12] = uSrc1.au16[14];
11187 uDstOut.au16[13] = uSrc2.au16[14];
11188 uDstOut.au16[14] = uSrc1.au16[15];
11189 uDstOut.au16[15] = uSrc2.au16[15];
11190 *puDst = uDstOut;
11191}
11192
11193
/*
 * PUNPCKHDQ - high dwords -> qword(s)
 */
11197#ifdef IEM_WITHOUT_ASSEMBLY
11198
11199IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11200{
11201 RTUINT64U const uSrc2 = { *puSrc };
11202 RTUINT64U const uSrc1 = { *puDst };
11203 ASMCompilerBarrier();
11204 RTUINT64U uDstOut;
11205 uDstOut.au32[0] = uSrc1.au32[1];
11206 uDstOut.au32[1] = uSrc2.au32[1];
11207 *puDst = uDstOut.u;
11208}
11209
11210
11211IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11212{
11213 RTUINT128U const uSrc2 = *puSrc;
11214 RTUINT128U const uSrc1 = *puDst;
11215 ASMCompilerBarrier();
11216 RTUINT128U uDstOut;
11217 uDstOut.au32[0] = uSrc1.au32[2];
11218 uDstOut.au32[1] = uSrc2.au32[2];
11219 uDstOut.au32[2] = uSrc1.au32[3];
11220 uDstOut.au32[3] = uSrc2.au32[3];
11221 *puDst = uDstOut;
11222}
11223
11224#endif
11225
11226IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11227{
11228 RTUINT128U const uSrc2 = *puSrc2;
11229 RTUINT128U const uSrc1 = *puSrc1;
11230 ASMCompilerBarrier();
11231 RTUINT128U uDstOut;
11232 uDstOut.au32[0] = uSrc1.au32[2];
11233 uDstOut.au32[1] = uSrc2.au32[2];
11234 uDstOut.au32[2] = uSrc1.au32[3];
11235 uDstOut.au32[3] = uSrc2.au32[3];
11236 *puDst = uDstOut;
11237}
11238
11239
11240IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11241{
11242 RTUINT256U const uSrc2 = *puSrc2;
11243 RTUINT256U const uSrc1 = *puSrc1;
11244 ASMCompilerBarrier();
11245 RTUINT256U uDstOut;
11246 uDstOut.au32[0] = uSrc1.au32[2];
11247 uDstOut.au32[1] = uSrc2.au32[2];
11248 uDstOut.au32[2] = uSrc1.au32[3];
11249 uDstOut.au32[3] = uSrc2.au32[3];
11250
11251 uDstOut.au32[4] = uSrc1.au32[6];
11252 uDstOut.au32[5] = uSrc2.au32[6];
11253 uDstOut.au32[6] = uSrc1.au32[7];
11254 uDstOut.au32[7] = uSrc2.au32[7];
11255 *puDst = uDstOut;
11256}
11257
11258
11259/*
11260 * PUNPCKHQDQ -> High qwords -> double qword(s).
11261 */
11262#ifdef IEM_WITHOUT_ASSEMBLY
11263IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11264{
11265 RTUINT128U const uSrc2 = *puSrc;
11266 RTUINT128U const uSrc1 = *puDst;
11267 ASMCompilerBarrier();
11268 RTUINT128U uDstOut;
11269 uDstOut.au64[0] = uSrc1.au64[1];
11270 uDstOut.au64[1] = uSrc2.au64[1];
11271 *puDst = uDstOut;
11272}
11273#endif
11274
11275
11276IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11277{
11278 RTUINT128U const uSrc2 = *puSrc2;
11279 RTUINT128U const uSrc1 = *puSrc1;
11280 ASMCompilerBarrier();
11281 RTUINT128U uDstOut;
11282 uDstOut.au64[0] = uSrc1.au64[1];
11283 uDstOut.au64[1] = uSrc2.au64[1];
11284 *puDst = uDstOut;
11285}
11286
11287
11288IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11289{
11290 RTUINT256U const uSrc2 = *puSrc2;
11291 RTUINT256U const uSrc1 = *puSrc1;
11292 ASMCompilerBarrier();
11293 RTUINT256U uDstOut;
11294 uDstOut.au64[0] = uSrc1.au64[1];
11295 uDstOut.au64[1] = uSrc2.au64[1];
11296
11297 uDstOut.au64[2] = uSrc1.au64[3];
11298 uDstOut.au64[3] = uSrc2.au64[3];
11299 *puDst = uDstOut;
11300}
11301
11302
11303/*
11304 * PUNPCKLBW - low bytes -> words
11305 */
11306#ifdef IEM_WITHOUT_ASSEMBLY
11307
11308IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11309{
11310 RTUINT64U const uSrc2 = { *puSrc };
11311 RTUINT64U const uSrc1 = { *puDst };
11312 ASMCompilerBarrier();
11313 RTUINT64U uDstOut;
11314 uDstOut.au8[0] = uSrc1.au8[0];
11315 uDstOut.au8[1] = uSrc2.au8[0];
11316 uDstOut.au8[2] = uSrc1.au8[1];
11317 uDstOut.au8[3] = uSrc2.au8[1];
11318 uDstOut.au8[4] = uSrc1.au8[2];
11319 uDstOut.au8[5] = uSrc2.au8[2];
11320 uDstOut.au8[6] = uSrc1.au8[3];
11321 uDstOut.au8[7] = uSrc2.au8[3];
11322 *puDst = uDstOut.u;
11323}
11324
11325
11326IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11327{
11328 RTUINT128U const uSrc2 = *puSrc;
11329 RTUINT128U const uSrc1 = *puDst;
11330 ASMCompilerBarrier();
11331 RTUINT128U uDstOut;
11332 uDstOut.au8[ 0] = uSrc1.au8[0];
11333 uDstOut.au8[ 1] = uSrc2.au8[0];
11334 uDstOut.au8[ 2] = uSrc1.au8[1];
11335 uDstOut.au8[ 3] = uSrc2.au8[1];
11336 uDstOut.au8[ 4] = uSrc1.au8[2];
11337 uDstOut.au8[ 5] = uSrc2.au8[2];
11338 uDstOut.au8[ 6] = uSrc1.au8[3];
11339 uDstOut.au8[ 7] = uSrc2.au8[3];
11340 uDstOut.au8[ 8] = uSrc1.au8[4];
11341 uDstOut.au8[ 9] = uSrc2.au8[4];
11342 uDstOut.au8[10] = uSrc1.au8[5];
11343 uDstOut.au8[11] = uSrc2.au8[5];
11344 uDstOut.au8[12] = uSrc1.au8[6];
11345 uDstOut.au8[13] = uSrc2.au8[6];
11346 uDstOut.au8[14] = uSrc1.au8[7];
11347 uDstOut.au8[15] = uSrc2.au8[7];
11348 *puDst = uDstOut;
11349}
11350
11351#endif
11352
11353IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11354{
11355 RTUINT128U const uSrc2 = *puSrc2;
11356 RTUINT128U const uSrc1 = *puSrc1;
11357 ASMCompilerBarrier();
11358 RTUINT128U uDstOut;
11359 uDstOut.au8[ 0] = uSrc1.au8[0];
11360 uDstOut.au8[ 1] = uSrc2.au8[0];
11361 uDstOut.au8[ 2] = uSrc1.au8[1];
11362 uDstOut.au8[ 3] = uSrc2.au8[1];
11363 uDstOut.au8[ 4] = uSrc1.au8[2];
11364 uDstOut.au8[ 5] = uSrc2.au8[2];
11365 uDstOut.au8[ 6] = uSrc1.au8[3];
11366 uDstOut.au8[ 7] = uSrc2.au8[3];
11367 uDstOut.au8[ 8] = uSrc1.au8[4];
11368 uDstOut.au8[ 9] = uSrc2.au8[4];
11369 uDstOut.au8[10] = uSrc1.au8[5];
11370 uDstOut.au8[11] = uSrc2.au8[5];
11371 uDstOut.au8[12] = uSrc1.au8[6];
11372 uDstOut.au8[13] = uSrc2.au8[6];
11373 uDstOut.au8[14] = uSrc1.au8[7];
11374 uDstOut.au8[15] = uSrc2.au8[7];
11375 *puDst = uDstOut;
11376}
11377
11378
11379IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11380{
11381 RTUINT256U const uSrc2 = *puSrc2;
11382 RTUINT256U const uSrc1 = *puSrc1;
11383 ASMCompilerBarrier();
11384 RTUINT256U uDstOut;
11385 uDstOut.au8[ 0] = uSrc1.au8[0];
11386 uDstOut.au8[ 1] = uSrc2.au8[0];
11387 uDstOut.au8[ 2] = uSrc1.au8[1];
11388 uDstOut.au8[ 3] = uSrc2.au8[1];
11389 uDstOut.au8[ 4] = uSrc1.au8[2];
11390 uDstOut.au8[ 5] = uSrc2.au8[2];
11391 uDstOut.au8[ 6] = uSrc1.au8[3];
11392 uDstOut.au8[ 7] = uSrc2.au8[3];
11393 uDstOut.au8[ 8] = uSrc1.au8[4];
11394 uDstOut.au8[ 9] = uSrc2.au8[4];
11395 uDstOut.au8[10] = uSrc1.au8[5];
11396 uDstOut.au8[11] = uSrc2.au8[5];
11397 uDstOut.au8[12] = uSrc1.au8[6];
11398 uDstOut.au8[13] = uSrc2.au8[6];
11399 uDstOut.au8[14] = uSrc1.au8[7];
11400 uDstOut.au8[15] = uSrc2.au8[7];
11401 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11402 uDstOut.au8[16] = uSrc1.au8[16];
11403 uDstOut.au8[17] = uSrc2.au8[16];
11404 uDstOut.au8[18] = uSrc1.au8[17];
11405 uDstOut.au8[19] = uSrc2.au8[17];
11406 uDstOut.au8[20] = uSrc1.au8[18];
11407 uDstOut.au8[21] = uSrc2.au8[18];
11408 uDstOut.au8[22] = uSrc1.au8[19];
11409 uDstOut.au8[23] = uSrc2.au8[19];
11410 uDstOut.au8[24] = uSrc1.au8[20];
11411 uDstOut.au8[25] = uSrc2.au8[20];
11412 uDstOut.au8[26] = uSrc1.au8[21];
11413 uDstOut.au8[27] = uSrc2.au8[21];
11414 uDstOut.au8[28] = uSrc1.au8[22];
11415 uDstOut.au8[29] = uSrc2.au8[22];
11416 uDstOut.au8[30] = uSrc1.au8[23];
11417 uDstOut.au8[31] = uSrc2.au8[23];
11418 *puDst = uDstOut;
11419}
11420
11421
/*
 * PUNPCKLWD - low words -> dwords
 */
11425#ifdef IEM_WITHOUT_ASSEMBLY
11426
11427IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11428{
11429 RTUINT64U const uSrc2 = { *puSrc };
11430 RTUINT64U const uSrc1 = { *puDst };
11431 ASMCompilerBarrier();
11432 RTUINT64U uDstOut;
11433 uDstOut.au16[0] = uSrc1.au16[0];
11434 uDstOut.au16[1] = uSrc2.au16[0];
11435 uDstOut.au16[2] = uSrc1.au16[1];
11436 uDstOut.au16[3] = uSrc2.au16[1];
11437 *puDst = uDstOut.u;
11438}
11439
11440
11441IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11442{
11443 RTUINT128U const uSrc2 = *puSrc;
11444 RTUINT128U const uSrc1 = *puDst;
11445 ASMCompilerBarrier();
11446 RTUINT128U uDstOut;
11447 uDstOut.au16[0] = uSrc1.au16[0];
11448 uDstOut.au16[1] = uSrc2.au16[0];
11449 uDstOut.au16[2] = uSrc1.au16[1];
11450 uDstOut.au16[3] = uSrc2.au16[1];
11451 uDstOut.au16[4] = uSrc1.au16[2];
11452 uDstOut.au16[5] = uSrc2.au16[2];
11453 uDstOut.au16[6] = uSrc1.au16[3];
11454 uDstOut.au16[7] = uSrc2.au16[3];
11455 *puDst = uDstOut;
11456}
11457
11458#endif
11459
11460IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11461{
11462 RTUINT128U const uSrc2 = *puSrc2;
11463 RTUINT128U const uSrc1 = *puSrc1;
11464 ASMCompilerBarrier();
11465 RTUINT128U uDstOut;
11466 uDstOut.au16[0] = uSrc1.au16[0];
11467 uDstOut.au16[1] = uSrc2.au16[0];
11468 uDstOut.au16[2] = uSrc1.au16[1];
11469 uDstOut.au16[3] = uSrc2.au16[1];
11470 uDstOut.au16[4] = uSrc1.au16[2];
11471 uDstOut.au16[5] = uSrc2.au16[2];
11472 uDstOut.au16[6] = uSrc1.au16[3];
11473 uDstOut.au16[7] = uSrc2.au16[3];
11474 *puDst = uDstOut;
11475}
11476
11477
11478IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11479{
11480 RTUINT256U const uSrc2 = *puSrc2;
11481 RTUINT256U const uSrc1 = *puSrc1;
11482 ASMCompilerBarrier();
11483 RTUINT256U uDstOut;
11484 uDstOut.au16[0] = uSrc1.au16[0];
11485 uDstOut.au16[1] = uSrc2.au16[0];
11486 uDstOut.au16[2] = uSrc1.au16[1];
11487 uDstOut.au16[3] = uSrc2.au16[1];
11488 uDstOut.au16[4] = uSrc1.au16[2];
11489 uDstOut.au16[5] = uSrc2.au16[2];
11490 uDstOut.au16[6] = uSrc1.au16[3];
11491 uDstOut.au16[7] = uSrc2.au16[3];
11492
11493 uDstOut.au16[8] = uSrc1.au16[8];
11494 uDstOut.au16[9] = uSrc2.au16[8];
11495 uDstOut.au16[10] = uSrc1.au16[9];
11496 uDstOut.au16[11] = uSrc2.au16[9];
11497 uDstOut.au16[12] = uSrc1.au16[10];
11498 uDstOut.au16[13] = uSrc2.au16[10];
11499 uDstOut.au16[14] = uSrc1.au16[11];
11500 uDstOut.au16[15] = uSrc2.au16[11];
11501 *puDst = uDstOut;
11502}
11503
11504
/*
 * PUNPCKLDQ - low dwords -> qword(s)
 */
11508#ifdef IEM_WITHOUT_ASSEMBLY
11509
11510IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11511{
11512 RTUINT64U const uSrc2 = { *puSrc };
11513 RTUINT64U const uSrc1 = { *puDst };
11514 ASMCompilerBarrier();
11515 RTUINT64U uDstOut;
11516 uDstOut.au32[0] = uSrc1.au32[0];
11517 uDstOut.au32[1] = uSrc2.au32[0];
11518 *puDst = uDstOut.u;
11519}
11520
11521
11522IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11523{
11524 RTUINT128U const uSrc2 = *puSrc;
11525 RTUINT128U const uSrc1 = *puDst;
11526 ASMCompilerBarrier();
11527 RTUINT128U uDstOut;
11528 uDstOut.au32[0] = uSrc1.au32[0];
11529 uDstOut.au32[1] = uSrc2.au32[0];
11530 uDstOut.au32[2] = uSrc1.au32[1];
11531 uDstOut.au32[3] = uSrc2.au32[1];
11532 *puDst = uDstOut;
11533}
11534
11535#endif
11536
11537IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11538{
11539 RTUINT128U const uSrc2 = *puSrc2;
11540 RTUINT128U const uSrc1 = *puSrc1;
11541 ASMCompilerBarrier();
11542 RTUINT128U uDstOut;
11543 uDstOut.au32[0] = uSrc1.au32[0];
11544 uDstOut.au32[1] = uSrc2.au32[0];
11545 uDstOut.au32[2] = uSrc1.au32[1];
11546 uDstOut.au32[3] = uSrc2.au32[1];
11547 *puDst = uDstOut;
11548}
11549
11550
11551IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11552{
11553 RTUINT256U const uSrc2 = *puSrc2;
11554 RTUINT256U const uSrc1 = *puSrc1;
11555 ASMCompilerBarrier();
11556 RTUINT256U uDstOut;
11557 uDstOut.au32[0] = uSrc1.au32[0];
11558 uDstOut.au32[1] = uSrc2.au32[0];
11559 uDstOut.au32[2] = uSrc1.au32[1];
11560 uDstOut.au32[3] = uSrc2.au32[1];
11561
11562 uDstOut.au32[4] = uSrc1.au32[4];
11563 uDstOut.au32[5] = uSrc2.au32[4];
11564 uDstOut.au32[6] = uSrc1.au32[5];
11565 uDstOut.au32[7] = uSrc2.au32[5];
11566 *puDst = uDstOut;
11567}
11568
11569
11570/*
11571 * PUNPCKLQDQ -> Low qwords -> double qword(s).
11572 */
11573#ifdef IEM_WITHOUT_ASSEMBLY
11574IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11575{
11576 RTUINT128U const uSrc2 = *puSrc;
11577 RTUINT128U const uSrc1 = *puDst;
11578 ASMCompilerBarrier();
11579 RTUINT128U uDstOut;
11580 uDstOut.au64[0] = uSrc1.au64[0];
11581 uDstOut.au64[1] = uSrc2.au64[0];
11582 *puDst = uDstOut;
11583}
11584#endif
11585
11586
11587IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11588{
11589 RTUINT128U const uSrc2 = *puSrc2;
11590 RTUINT128U const uSrc1 = *puSrc1;
11591 ASMCompilerBarrier();
11592 RTUINT128U uDstOut;
11593 uDstOut.au64[0] = uSrc1.au64[0];
11594 uDstOut.au64[1] = uSrc2.au64[0];
11595 *puDst = uDstOut;
11596}
11597
11598
11599IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11600{
11601 RTUINT256U const uSrc2 = *puSrc2;
11602 RTUINT256U const uSrc1 = *puSrc1;
11603 ASMCompilerBarrier();
11604 RTUINT256U uDstOut;
11605 uDstOut.au64[0] = uSrc1.au64[0];
11606 uDstOut.au64[1] = uSrc2.au64[0];
11607
11608 uDstOut.au64[2] = uSrc1.au64[2];
11609 uDstOut.au64[3] = uSrc2.au64[2];
11610 *puDst = uDstOut;
11611}
11612
11613
11614/*
11615 * PACKSSWB - signed words -> signed bytes
11616 */
11617
11618#ifdef IEM_WITHOUT_ASSEMBLY
11619
11620IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11621{
11622 RTUINT64U const uSrc2 = { *puSrc };
11623 RTUINT64U const uSrc1 = { *puDst };
11624 ASMCompilerBarrier();
11625 RTUINT64U uDstOut;
11626 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
11627 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
11628 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
11629 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
11630 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
11631 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
11632 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
11633 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
11634 *puDst = uDstOut.u;
11635}
11636
11637
11638IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11639{
11640 RTUINT128U const uSrc2 = *puSrc;
11641 RTUINT128U const uSrc1 = *puDst;
11642 ASMCompilerBarrier();
11643 RTUINT128U uDstOut;
11644 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
11645 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
11646 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
11647 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
11648 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
11649 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
11650 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
11651 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
11652 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
11653 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
11654 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
11655 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
11656 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
11657 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
11658 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
11659 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
11660 *puDst = uDstOut;
11661}
11662
11663#endif
11664
11665IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11666{
11667 RTUINT128U const uSrc2 = *puSrc2;
11668 RTUINT128U const uSrc1 = *puSrc1;
11669 ASMCompilerBarrier();
11670 RTUINT128U uDstOut;
11671 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
11672 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
11673 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
11674 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
11675 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
11676 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
11677 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
11678 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
11679 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
11680 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
11681 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
11682 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
11683 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
11684 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
11685 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
11686 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
11687 *puDst = uDstOut;
11688}
11689
11690
11691IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11692{
11693 RTUINT256U const uSrc2 = *puSrc2;
11694 RTUINT256U const uSrc1 = *puSrc1;
11695 ASMCompilerBarrier();
11696 RTUINT256U uDstOut;
11697 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
11698 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
11699 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
11700 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
11701 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
11702 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
11703 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
11704 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
11705 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
11706 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
11707 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
11708 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
11709 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
11710 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
11711 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
11712 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
11713
11714 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
11715 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
11716 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
11717 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
11718 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
11719 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
11720 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
11721 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
11722 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
11723 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
11724 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
11725 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
11726 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
11727 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
11728 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
11729 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
11730 *puDst = uDstOut;
11731}
11732
11733
/*
 * PACKUSWB - signed words -> unsigned bytes
 */
/** Converts a 16-bit value, interpreted as signed, to an unsigned byte:
 * values 0..0xff pass through unchanged, values with bit 15 (the sign bit)
 * set become 0x00 and everything else becomes 0xff. */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    (   ((a_iWord) & 0xff00) == 0 \
      ? (uint8_t)(a_iWord) \
      : ((a_iWord) & 0x8000) != 0 \
      ? (uint8_t)0x00 /* negative -> UINT8_MIN */ \
      : (uint8_t)0xff /* too large -> UINT8_MAX */ )
11741
11742#ifdef IEM_WITHOUT_ASSEMBLY
11743
11744IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11745{
11746 RTUINT64U const uSrc2 = { *puSrc };
11747 RTUINT64U const uSrc1 = { *puDst };
11748 ASMCompilerBarrier();
11749 RTUINT64U uDstOut;
11750 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
11751 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
11752 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
11753 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
11754 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
11755 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
11756 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
11757 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
11758 *puDst = uDstOut.u;
11759}
11760
11761
11762IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11763{
11764 RTUINT128U const uSrc2 = *puSrc;
11765 RTUINT128U const uSrc1 = *puDst;
11766 ASMCompilerBarrier();
11767 RTUINT128U uDstOut;
11768 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
11769 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
11770 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
11771 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
11772 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
11773 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
11774 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
11775 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
11776 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
11777 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
11778 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
11779 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
11780 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
11781 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
11782 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
11783 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
11784 *puDst = uDstOut;
11785}
11786
11787#endif
11788
11789IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11790{
11791 RTUINT128U const uSrc2 = *puSrc2;
11792 RTUINT128U const uSrc1 = *puSrc1;
11793 ASMCompilerBarrier();
11794 RTUINT128U uDstOut;
11795 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
11796 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
11797 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
11798 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
11799 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
11800 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
11801 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
11802 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
11803 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
11804 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
11805 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
11806 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
11807 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
11808 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
11809 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
11810 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
11811 *puDst = uDstOut;
11812}
11813
11814
11815IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11816{
11817 RTUINT256U const uSrc2 = *puSrc2;
11818 RTUINT256U const uSrc1 = *puSrc1;
11819 ASMCompilerBarrier();
11820 RTUINT256U uDstOut;
11821 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
11822 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
11823 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
11824 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
11825 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
11826 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
11827 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
11828 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
11829 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
11830 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
11831 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
11832 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
11833 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
11834 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
11835 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
11836 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
11837
11838 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
11839 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
11840 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
11841 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
11842 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
11843 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
11844 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
11845 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
11846 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
11847 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
11848 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
11849 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
11850 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
11851 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
11852 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
11853 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
11854 *puDst = uDstOut;
11855}
11856
11857
11858/*
11859 * PACKSSDW - signed dwords -> signed words
11860 */
11861
11862#ifdef IEM_WITHOUT_ASSEMBLY
11863
11864IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11865{
11866 RTUINT64U const uSrc2 = { *puSrc };
11867 RTUINT64U const uSrc1 = { *puDst };
11868 ASMCompilerBarrier();
11869 RTUINT64U uDstOut;
11870 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
11871 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
11872 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
11873 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
11874 *puDst = uDstOut.u;
11875}
11876
11877
11878IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11879{
11880 RTUINT128U const uSrc2 = *puSrc;
11881 RTUINT128U const uSrc1 = *puDst;
11882 ASMCompilerBarrier();
11883 RTUINT128U uDstOut;
11884 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
11885 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
11886 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
11887 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
11888 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
11889 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
11890 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
11891 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
11892 *puDst = uDstOut;
11893}
11894
11895#endif
11896
11897IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11898{
11899 RTUINT128U const uSrc2 = *puSrc2;
11900 RTUINT128U const uSrc1 = *puSrc1;
11901 ASMCompilerBarrier();
11902 RTUINT128U uDstOut;
11903 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
11904 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
11905 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
11906 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
11907 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
11908 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
11909 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
11910 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
11911 *puDst = uDstOut;
11912}
11913
11914
11915IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11916{
11917 RTUINT256U const uSrc2 = *puSrc2;
11918 RTUINT256U const uSrc1 = *puSrc1;
11919 ASMCompilerBarrier();
11920 RTUINT256U uDstOut;
11921 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
11922 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
11923 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
11924 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
11925 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
11926 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
11927 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
11928 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
11929
11930 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
11931 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
11932 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
11933 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
11934 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
11935 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
11936 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
11937 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
11938 *puDst = uDstOut;
11939}
11940
11941
/*
 * PACKUSDW - signed dwords -> unsigned words
 */
/** Converts a 32-bit value, interpreted as signed, to an unsigned word:
 * values 0..0xffff pass through unchanged, values with bit 31 (the sign bit)
 * set become 0x0000 and everything else becomes 0xffff. */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    (   ((a_iDword) & 0xffff0000U) == 0 \
      ? (uint16_t)(a_iDword) \
      : ((a_iDword) & 0x80000000U) != 0 \
      ? (uint16_t)0x0000 /* negative -> UINT16_MIN */ \
      : (uint16_t)0xffff /* too large -> UINT16_MAX */ )
11949
11950#ifdef IEM_WITHOUT_ASSEMBLY
11951IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11952{
11953 RTUINT128U const uSrc2 = *puSrc;
11954 RTUINT128U const uSrc1 = *puDst;
11955 ASMCompilerBarrier();
11956 RTUINT128U uDstOut;
11957 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
11958 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
11959 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
11960 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
11961 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
11962 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
11963 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
11964 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
11965 *puDst = uDstOut;
11966}
11967#endif
11968
11969IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11970{
11971 RTUINT128U const uSrc2 = *puSrc2;
11972 RTUINT128U const uSrc1 = *puSrc1;
11973 ASMCompilerBarrier();
11974 RTUINT128U uDstOut;
11975 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
11976 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
11977 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
11978 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
11979 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
11980 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
11981 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
11982 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
11983 *puDst = uDstOut;
11984}
11985
11986
11987IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11988{
11989 RTUINT256U const uSrc2 = *puSrc2;
11990 RTUINT256U const uSrc1 = *puSrc1;
11991 ASMCompilerBarrier();
11992 RTUINT256U uDstOut;
11993 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
11994 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
11995 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
11996 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
11997 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
11998 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
11999 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12000 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12001
12002 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
12003 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
12004 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
12005 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
12006 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
12007 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
12008 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
12009 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
12010 *puDst = uDstOut;
12011}
12012
12013
12014/*
12015 * [V]PABSB / [V]PABSW / [V]PABSD
12016 */
12017
12018IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12019{
12020 RTUINT64U const uSrc = { *puSrc };
12021 RTUINT64U uDstOut = { 0 };
12022
12023 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
12024 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
12025 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
12026 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
12027 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
12028 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
12029 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
12030 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
12031 *puDst = uDstOut.u;
12032 RT_NOREF(pFpuState);
12033}
12034
12035
12036IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12037{
12038 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12039 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12040 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12041 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12042 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12043 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12044 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12045 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12046 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12047 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12048 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12049 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12050 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12051 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12052 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12053 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12054 RT_NOREF(pFpuState);
12055}
12056
12057
12058IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12059{
12060 RTUINT64U const uSrc = { *puSrc };
12061 RTUINT64U uDstOut = { 0 };
12062
12063 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
12064 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
12065 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
12066 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
12067 *puDst = uDstOut.u;
12068 RT_NOREF(pFpuState);
12069}
12070
12071
12072IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12073{
12074 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12075 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12076 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12077 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12078 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12079 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12080 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12081 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12082 RT_NOREF(pFpuState);
12083}
12084
12085
12086IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12087{
12088 RTUINT64U const uSrc = { *puSrc };
12089 RTUINT64U uDstOut = { 0 };
12090
12091 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
12092 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
12093 *puDst = uDstOut.u;
12094 RT_NOREF(pFpuState);
12095}
12096
12097
12098IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12099{
12100 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12101 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12102 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12103 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12104 RT_NOREF(pFpuState);
12105}
12106
12107
12108IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12109{
12110 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12111 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12112 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12113 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12114 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12115 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12116 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12117 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12118 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12119 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12120 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12121 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12122 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12123 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12124 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12125 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12126}
12127
12128
12129IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12130{
12131 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12132 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12133 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12134 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12135 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12136 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12137 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12138 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12139 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12140 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12141 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12142 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12143 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12144 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12145 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12146 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12147 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
12148 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
12149 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
12150 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
12151 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
12152 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
12153 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
12154 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
12155 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
12156 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
12157 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
12158 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
12159 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
12160 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
12161 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
12162 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
12163}
12164
12165
12166IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12167{
12168 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12169 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12170 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12171 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12172 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12173 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12174 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12175 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12176}
12177
12178
12179IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12180{
12181 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12182 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12183 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12184 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12185 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12186 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12187 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12188 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12189 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
12190 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
12191 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
12192 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
12193 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
12194 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
12195 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
12196 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
12197}
12198
12199
12200IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12201{
12202 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12203 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12204 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12205 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12206}
12207
12208
12209IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12210{
12211 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12212 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12213 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12214 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12215 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
12216 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
12217 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
12218 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
12219}
12220
12221
12222/*
12223 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
12224 */
12225IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12226{
12227 RTUINT64U uSrc1 = { *puDst };
12228 RTUINT64U uSrc2 = { *puSrc };
12229 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12230
12231 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
12232 {
12233 if (uSrc2.ai8[i] < 0)
12234 uDst.ai8[i] = -uSrc1.ai8[i];
12235 else if (uSrc2.ai8[i] == 0)
12236 uDst.ai8[i] = 0;
12237 else /* uSrc2.ai8[i] > 0 */
12238 uDst.ai8[i] = uSrc1.ai8[i];
12239 }
12240
12241 *puDst = uDst.u;
12242 RT_NOREF(pFpuState);
12243}
12244
12245
12246IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12247{
12248 RTUINT128U uSrc1 = *puDst;
12249
12250 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12251 {
12252 if (puSrc->ai8[i] < 0)
12253 puDst->ai8[i] = -uSrc1.ai8[i];
12254 else if (puSrc->ai8[i] == 0)
12255 puDst->ai8[i] = 0;
12256 else /* puSrc->ai8[i] > 0 */
12257 puDst->ai8[i] = uSrc1.ai8[i];
12258 }
12259
12260 RT_NOREF(pFpuState);
12261}
12262
12263
12264IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12265{
12266 RTUINT64U uSrc1 = { *puDst };
12267 RTUINT64U uSrc2 = { *puSrc };
12268 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12269
12270 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
12271 {
12272 if (uSrc2.ai16[i] < 0)
12273 uDst.ai16[i] = -uSrc1.ai16[i];
12274 else if (uSrc2.ai16[i] == 0)
12275 uDst.ai16[i] = 0;
12276 else /* uSrc2.ai16[i] > 0 */
12277 uDst.ai16[i] = uSrc1.ai16[i];
12278 }
12279
12280 *puDst = uDst.u;
12281 RT_NOREF(pFpuState);
12282}
12283
12284
12285IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12286{
12287 RTUINT128U uSrc1 = *puDst;
12288
12289 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12290 {
12291 if (puSrc->ai16[i] < 0)
12292 puDst->ai16[i] = -uSrc1.ai16[i];
12293 else if (puSrc->ai16[i] == 0)
12294 puDst->ai16[i] = 0;
12295 else /* puSrc->ai16[i] > 0 */
12296 puDst->ai16[i] = uSrc1.ai16[i];
12297 }
12298
12299 RT_NOREF(pFpuState);
12300}
12301
12302
12303IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12304{
12305 RTUINT64U uSrc1 = { *puDst };
12306 RTUINT64U uSrc2 = { *puSrc };
12307 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12308
12309 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
12310 {
12311 if (uSrc2.ai32[i] < 0)
12312 uDst.ai32[i] = -uSrc1.ai32[i];
12313 else if (uSrc2.ai32[i] == 0)
12314 uDst.ai32[i] = 0;
12315 else /* uSrc2.ai32[i] > 0 */
12316 uDst.ai32[i] = uSrc1.ai32[i];
12317 }
12318
12319 *puDst = uDst.u;
12320 RT_NOREF(pFpuState);
12321}
12322
12323
12324IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12325{
12326 RTUINT128U uSrc1 = *puDst;
12327
12328 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12329 {
12330 if (puSrc->ai32[i] < 0)
12331 puDst->ai32[i] = -uSrc1.ai32[i];
12332 else if (puSrc->ai32[i] == 0)
12333 puDst->ai32[i] = 0;
12334 else /* puSrc->ai32[i] > 0 */
12335 puDst->ai32[i] = uSrc1.ai32[i];
12336 }
12337
12338 RT_NOREF(pFpuState);
12339}
12340
12341
12342IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12343{
12344 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12345 {
12346 if (puSrc2->ai8[i] < 0)
12347 puDst->ai8[i] = -puSrc1->ai8[i];
12348 else if (puSrc2->ai8[i] == 0)
12349 puDst->ai8[i] = 0;
12350 else /* puSrc2->ai8[i] > 0 */
12351 puDst->ai8[i] = puSrc1->ai8[i];
12352 }
12353}
12354
12355
12356IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12357{
12358 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12359 {
12360 if (puSrc2->ai8[i] < 0)
12361 puDst->ai8[i] = -puSrc1->ai8[i];
12362 else if (puSrc2->ai8[i] == 0)
12363 puDst->ai8[i] = 0;
12364 else /* puSrc2->ai8[i] > 0 */
12365 puDst->ai8[i] = puSrc1->ai8[i];
12366 }
12367}
12368
12369
12370IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12371{
12372 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12373 {
12374 if (puSrc2->ai16[i] < 0)
12375 puDst->ai16[i] = -puSrc1->ai16[i];
12376 else if (puSrc2->ai16[i] == 0)
12377 puDst->ai16[i] = 0;
12378 else /* puSrc2->ai16[i] > 0 */
12379 puDst->ai16[i] = puSrc1->ai16[i];
12380 }
12381}
12382
12383
12384IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12385{
12386 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12387 {
12388 if (puSrc2->ai16[i] < 0)
12389 puDst->ai16[i] = -puSrc1->ai16[i];
12390 else if (puSrc2->ai16[i] == 0)
12391 puDst->ai16[i] = 0;
12392 else /* puSrc2->ai16[i] > 0 */
12393 puDst->ai16[i] = puSrc1->ai16[i];
12394 }
12395}
12396
12397
12398IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12399{
12400 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12401 {
12402 if (puSrc2->ai32[i] < 0)
12403 puDst->ai32[i] = -puSrc1->ai32[i];
12404 else if (puSrc2->ai32[i] == 0)
12405 puDst->ai32[i] = 0;
12406 else /* puSrc2->ai32[i] > 0 */
12407 puDst->ai32[i] = puSrc1->ai32[i];
12408 }
12409}
12410
12411
12412IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12413{
12414 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12415 {
12416 if (puSrc2->ai32[i] < 0)
12417 puDst->ai32[i] = -puSrc1->ai32[i];
12418 else if (puSrc2->ai32[i] == 0)
12419 puDst->ai32[i] = 0;
12420 else /* puSrc2->ai32[i] > 0 */
12421 puDst->ai32[i] = puSrc1->ai32[i];
12422 }
12423}
12424
12425
12426/*
12427 * PHADDW / VPHADDW / PHADDD / VPHADDD
12428 */
12429IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12430{
12431 RTUINT64U uSrc1 = { *puDst };
12432 RTUINT64U uSrc2 = { *puSrc };
12433 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12434
12435 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
12436 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
12437 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
12438 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
12439 *puDst = uDst.u;
12440 RT_NOREF(pFpuState);
12441}
12442
12443
12444IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12445{
12446 RTUINT128U uSrc1 = *puDst;
12447
12448 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
12449 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
12450 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
12451 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
12452
12453 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
12454 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
12455 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
12456 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
12457 RT_NOREF(pFpuState);
12458}
12459
12460
12461IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12462{
12463 RTUINT64U uSrc1 = { *puDst };
12464 RTUINT64U uSrc2 = { *puSrc };
12465 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12466
12467 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
12468 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
12469 *puDst = uDst.u;
12470 RT_NOREF(pFpuState);
12471}
12472
12473
12474IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12475{
12476 RTUINT128U uSrc1 = *puDst;
12477
12478 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
12479 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
12480
12481 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
12482 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
12483 RT_NOREF(pFpuState);
12484}
12485
12486
12487IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12488{
12489 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12490
12491 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
12492 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
12493 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
12494 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
12495
12496 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
12497 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
12498 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
12499 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
12500
12501 puDst->au64[0] = uDst.au64[0];
12502 puDst->au64[1] = uDst.au64[1];
12503}
12504
12505
12506IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12507{
12508 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12509
12510 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
12511 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
12512 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
12513 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
12514 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
12515 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
12516 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
12517 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
12518
12519 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
12520 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
12521 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
12522 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
12523 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
12524 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
12525 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
12526 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
12527
12528 puDst->au64[0] = uDst.au64[0];
12529 puDst->au64[1] = uDst.au64[1];
12530 puDst->au64[2] = uDst.au64[2];
12531 puDst->au64[3] = uDst.au64[3];
12532}
12533
12534
12535IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12536{
12537 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12538
12539 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
12540 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
12541
12542 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
12543 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
12544
12545 puDst->au64[0] = uDst.au64[0];
12546 puDst->au64[1] = uDst.au64[1];
12547}
12548
12549
12550IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12551{
12552 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12553
12554 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
12555 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
12556 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
12557 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
12558
12559 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
12560 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
12561 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
12562 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
12563
12564 puDst->au64[0] = uDst.au64[0];
12565 puDst->au64[1] = uDst.au64[1];
12566 puDst->au64[2] = uDst.au64[2];
12567 puDst->au64[3] = uDst.au64[3];
12568}
12569
12570
12571/*
12572 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
12573 */
12574IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12575{
12576 RTUINT64U uSrc1 = { *puDst };
12577 RTUINT64U uSrc2 = { *puSrc };
12578 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12579
12580 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
12581 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
12582 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
12583 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
12584 *puDst = uDst.u;
12585 RT_NOREF(pFpuState);
12586}
12587
12588
12589IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12590{
12591 RTUINT128U uSrc1 = *puDst;
12592
12593 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
12594 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
12595 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
12596 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
12597
12598 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
12599 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
12600 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
12601 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
12602 RT_NOREF(pFpuState);
12603}
12604
12605
12606IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12607{
12608 RTUINT64U uSrc1 = { *puDst };
12609 RTUINT64U uSrc2 = { *puSrc };
12610 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12611
12612 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
12613 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
12614 *puDst = uDst.u;
12615 RT_NOREF(pFpuState);
12616}
12617
12618
12619IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12620{
12621 RTUINT128U uSrc1 = *puDst;
12622
12623 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
12624 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
12625
12626 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
12627 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
12628 RT_NOREF(pFpuState);
12629}
12630
12631
12632IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12633{
12634 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12635
12636 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
12637 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
12638 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
12639 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
12640
12641 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
12642 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
12643 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
12644 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
12645
12646 puDst->au64[0] = uDst.au64[0];
12647 puDst->au64[1] = uDst.au64[1];
12648}
12649
12650
12651IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12652{
12653 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12654
12655 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
12656 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
12657 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
12658 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
12659 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
12660 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
12661 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
12662 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
12663
12664 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
12665 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
12666 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
12667 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
12668 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
12669 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
12670 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
12671 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
12672
12673 puDst->au64[0] = uDst.au64[0];
12674 puDst->au64[1] = uDst.au64[1];
12675 puDst->au64[2] = uDst.au64[2];
12676 puDst->au64[3] = uDst.au64[3];
12677}
12678
12679
12680IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12681{
12682 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12683
12684 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
12685 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
12686
12687 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
12688 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
12689
12690 puDst->au64[0] = uDst.au64[0];
12691 puDst->au64[1] = uDst.au64[1];
12692}
12693
12694
12695IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12696{
12697 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12698
12699 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
12700 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
12701 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
12702 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
12703
12704 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
12705 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
12706 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
12707 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
12708
12709 puDst->au64[0] = uDst.au64[0];
12710 puDst->au64[1] = uDst.au64[1];
12711 puDst->au64[2] = uDst.au64[2];
12712 puDst->au64[3] = uDst.au64[3];
12713}
12714
12715
12716/*
12717 * PHADDSW / VPHADDSW
12718 */
12719IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12720{
12721 RTUINT64U uSrc1 = { *puDst };
12722 RTUINT64U uSrc2 = { *puSrc };
12723 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12724
12725 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
12726 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
12727 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
12728 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
12729 *puDst = uDst.u;
12730 RT_NOREF(pFpuState);
12731}
12732
12733
12734IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12735{
12736 RTUINT128U uSrc1 = *puDst;
12737
12738 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
12739 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
12740 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
12741 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
12742
12743 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
12744 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
12745 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
12746 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
12747 RT_NOREF(pFpuState);
12748}
12749
12750
12751IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12752{
12753 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12754
12755 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
12756 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
12757 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
12758 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
12759
12760 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
12761 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
12762 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
12763 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
12764
12765 puDst->au64[0] = uDst.au64[0];
12766 puDst->au64[1] = uDst.au64[1];
12767}
12768
12769
12770IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12771{
12772 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12773
12774 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
12775 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
12776 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
12777 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
12778 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
12779 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
12780 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
12781 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
12782
12783 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
12784 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
12785 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
12786 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
12787 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
12788 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
12789 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
12790 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
12791
12792 puDst->au64[0] = uDst.au64[0];
12793 puDst->au64[1] = uDst.au64[1];
12794 puDst->au64[2] = uDst.au64[2];
12795 puDst->au64[3] = uDst.au64[3];
12796}
12797
12798
12799/*
12800 * PHSUBSW / VPHSUBSW
12801 */
12802IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12803{
12804 RTUINT64U uSrc1 = { *puDst };
12805 RTUINT64U uSrc2 = { *puSrc };
12806 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12807
12808 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
12809 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
12810 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
12811 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
12812 *puDst = uDst.u;
12813 RT_NOREF(pFpuState);
12814}
12815
12816
12817IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12818{
12819 RTUINT128U uSrc1 = *puDst;
12820
12821 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
12822 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
12823 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
12824 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
12825
12826 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
12827 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
12828 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
12829 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
12830 RT_NOREF(pFpuState);
12831}
12832
12833
12834IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12835{
12836 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12837
12838 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
12839 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
12840 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
12841 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
12842
12843 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
12844 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
12845 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
12846 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
12847
12848 puDst->au64[0] = uDst.au64[0];
12849 puDst->au64[1] = uDst.au64[1];
12850}
12851
12852
12853IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12854{
12855 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12856
12857 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
12858 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
12859 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
12860 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
12861 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
12862 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
12863 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
12864 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
12865
12866 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
12867 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
12868 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
12869 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
12870 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
12871 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
12872 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
12873 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
12874
12875 puDst->au64[0] = uDst.au64[0];
12876 puDst->au64[1] = uDst.au64[1];
12877 puDst->au64[2] = uDst.au64[2];
12878 puDst->au64[3] = uDst.au64[3];
12879}
12880
12881
12882/*
12883 * PMADDUBSW / VPMADDUBSW
12884 */
12885IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12886{
12887 RTUINT64U uSrc1 = { *puDst };
12888 RTUINT64U uSrc2 = { *puSrc };
12889 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12890
12891 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
12892 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
12893 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
12894 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
12895 *puDst = uDst.u;
12896 RT_NOREF(pFpuState);
12897}
12898
12899
12900IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12901{
12902 RTUINT128U uSrc1 = *puDst;
12903
12904 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
12905 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
12906 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
12907 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
12908 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
12909 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
12910 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
12911 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
12912 RT_NOREF(pFpuState);
12913}
12914
12915
12916IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12917{
12918 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12919
12920 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
12921 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
12922 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
12923 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
12924 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
12925 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
12926 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
12927 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
12928
12929 puDst->au64[0] = uDst.au64[0];
12930 puDst->au64[1] = uDst.au64[1];
12931}
12932
12933
12934IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12935{
12936 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
12937
12938 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
12939 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
12940 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
12941 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
12942 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
12943 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
12944 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
12945 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
12946 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
12947 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
12948 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
12949 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
12950 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
12951 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
12952 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
12953 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
12954
12955 puDst->au64[0] = uDst.au64[0];
12956 puDst->au64[1] = uDst.au64[1];
12957 puDst->au64[2] = uDst.au64[2];
12958 puDst->au64[3] = uDst.au64[3];
12959}
12960
12961
12962/*
12963 * PMULHRSW / VPMULHRSW
12964 */
/**
 * Computes one PMULHRSW result element.
 *
 * The operands are multiplied as 32-bit signed values, the product is
 * shifted right by 14, incremented by one (the rounding step) and shifted
 * right once more; the low 16 bits of that are the result.
 *
 * The whole expansion is parenthesised so it can be used safely inside larger
 * expressions.  Each argument is evaluated exactly once.
 *
 * @param   a_Src1  First 16-bit signed factor.
 * @param   a_Src2  Second 16-bit signed factor.
 */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ((uint16_t)(((((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1) >> 1))
12967
12968IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12969{
12970 RTUINT64U uSrc1 = { *puDst };
12971 RTUINT64U uSrc2 = { *puSrc };
12972 RTUINT64U uDst;
12973
12974 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
12975 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
12976 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
12977 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
12978 *puDst = uDst.u;
12979 RT_NOREF(pFpuState);
12980}
12981
12982
12983IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12984{
12985 RTUINT128U uSrc1 = *puDst;
12986
12987 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
12988 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
12989 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
12990 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
12991 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
12992 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
12993 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
12994 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
12995 RT_NOREF(pFpuState);
12996}
12997
12998
12999IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13000{
13001 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13002
13003 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
13004 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
13005 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
13006 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
13007 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
13008 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
13009 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
13010 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
13011
13012 puDst->au64[0] = uDst.au64[0];
13013 puDst->au64[1] = uDst.au64[1];
13014}
13015
13016
13017IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13018{
13019 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13020
13021 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
13022 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
13023 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
13024 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
13025 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
13026 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
13027 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
13028 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
13029 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
13030 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
13031 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
13032 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
13033 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
13034 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
13035 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
13036 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
13037
13038 puDst->au64[0] = uDst.au64[0];
13039 puDst->au64[1] = uDst.au64[1];
13040 puDst->au64[2] = uDst.au64[2];
13041 puDst->au64[3] = uDst.au64[3];
13042}
13043
13044
13045/*
13046 * PSADBW / VPSADBW
13047 */
13048#ifdef IEM_WITHOUT_ASSEMBLY
13049
13050IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13051{
13052 RTUINT64U uSrc1 = { *puDst };
13053 RTUINT64U uSrc2 = { *puSrc };
13054 RTUINT64U uDst;
13055 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13056 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13057 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13058 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13059 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13060 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13061 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13062 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13063
13064 uDst.au64[0] = 0;
13065 uDst.au16[0] = uSum;
13066 *puDst = uDst.u;
13067}
13068
13069
13070IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13071{
13072 RTUINT128U uSrc1 = *puDst;
13073
13074 puDst->au64[0] = 0;
13075 puDst->au64[1] = 0;
13076
13077 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
13078 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
13079 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
13080 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
13081 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
13082 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
13083 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
13084 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
13085 puDst->au16[0] = uSum;
13086
13087 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
13088 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
13089 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
13090 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
13091 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
13092 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
13093 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
13094 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
13095 puDst->au16[4] = uSum;
13096}
13097
13098#endif
13099
13100IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13101{
13102 RTUINT128U uSrc1 = *puSrc1;
13103 RTUINT128U uSrc2 = *puSrc2;
13104
13105 puDst->au64[0] = 0;
13106 puDst->au64[1] = 0;
13107
13108 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
13109 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13110 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13111 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13112 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13113 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13114 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13115 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13116 puDst->au16[0] = uSum;
13117
13118 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13119 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13120 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13121 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13122 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13123 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13124 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13125 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13126 puDst->au16[4] = uSum;
13127}
13128
13129IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13130{
13131 RTUINT256U uSrc1 = *puSrc1;
13132 RTUINT256U uSrc2 = *puSrc2;
13133
13134 puDst->au64[0] = 0;
13135 puDst->au64[1] = 0;
13136 puDst->au64[2] = 0;
13137 puDst->au64[3] = 0;
13138
13139 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13140 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13141 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13142 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13143 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13144 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13145 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13146 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13147 puDst->au16[0] = uSum;
13148
13149 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13150 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13151 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13152 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13153 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13154 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13155 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13156 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13157 puDst->au16[4] = uSum;
13158
13159 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
13160 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
13161 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
13162 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
13163 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
13164 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
13165 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
13166 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
13167 puDst->au16[8] = uSum;
13168
13169 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
13170 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
13171 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
13172 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
13173 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
13174 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
13175 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
13176 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
13177 puDst->au16[12] = uSum;
13178}
13179
13180
13181/*
13182 * PMULDQ / VPMULDQ
13183 */
13184IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13185{
13186 RTUINT128U uSrc1 = *puDst;
13187
13188 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
13189 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
13190}
13191
13192IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13193{
13194 RTUINT128U uSrc1 = *puSrc1;
13195 RTUINT128U uSrc2 = *puSrc2;
13196
13197 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13198 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13199}
13200
13201IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13202{
13203 RTUINT256U uSrc1 = *puSrc1;
13204 RTUINT256U uSrc2 = *puSrc2;
13205
13206 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13207 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13208 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
13209 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
13210}
13211
13212
13213/*
13214 * PMULUDQ / VPMULUDQ
13215 */
13216#ifdef IEM_WITHOUT_ASSEMBLY
13217
13218IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13219{
13220 RTUINT64U uSrc1 = { *puDst };
13221 RTUINT64U uSrc2 = { *puSrc };
13222 ASMCompilerBarrier();
13223 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13224 RT_NOREF(pFpuState);
13225}
13226
13227
13228IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13229{
13230 RTUINT128U uSrc1 = *puDst;
13231 RTUINT128U uSrc2 = *puSrc;
13232 ASMCompilerBarrier();
13233 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13234 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13235 RT_NOREF(pFpuState);
13236}
13237
13238#endif
13239
13240IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13241{
13242 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13243 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13244 ASMCompilerBarrier();
13245 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13246 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13247}
13248
13249
13250IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13251{
13252 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13253 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13254 ASMCompilerBarrier();
13255 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13256 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13257 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
13258 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
13259}
13260
13261
13262/*
13263 * UNPCKLPS / VUNPCKLPS
13264 */
13265#ifdef IEM_WITHOUT_ASSEMBLY
13266IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13267{
13268 RTUINT128U uSrc1 = *puDst;
13269 RTUINT128U uSrc2 = *puSrc;
13270 ASMCompilerBarrier();
13271 puDst->au32[0] = uSrc1.au32[0];
13272 puDst->au32[1] = uSrc2.au32[0];
13273 puDst->au32[2] = uSrc1.au32[1];
13274 puDst->au32[3] = uSrc2.au32[1];
13275}
13276
13277#endif
13278
13279IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13280{
13281 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13282 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13283 ASMCompilerBarrier();
13284 puDst->au32[0] = uSrc1.au32[0];
13285 puDst->au32[1] = uSrc2.au32[0];
13286 puDst->au32[2] = uSrc1.au32[1];
13287 puDst->au32[3] = uSrc2.au32[1];
13288}
13289
13290
13291IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13292{
13293 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13294 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13295 ASMCompilerBarrier();
13296 puDst->au32[0] = uSrc1.au32[0];
13297 puDst->au32[1] = uSrc2.au32[0];
13298 puDst->au32[2] = uSrc1.au32[1];
13299 puDst->au32[3] = uSrc2.au32[1];
13300
13301 puDst->au32[4] = uSrc1.au32[4];
13302 puDst->au32[5] = uSrc2.au32[4];
13303 puDst->au32[6] = uSrc1.au32[5];
13304 puDst->au32[7] = uSrc2.au32[5];
13305}
13306
13307
13308/*
13309 * UNPCKLPD / VUNPCKLPD
13310 */
13311#ifdef IEM_WITHOUT_ASSEMBLY
13312IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13313{
13314 RTUINT128U uSrc1 = *puDst;
13315 RTUINT128U uSrc2 = *puSrc;
13316 ASMCompilerBarrier();
13317 puDst->au64[0] = uSrc1.au64[0];
13318 puDst->au64[1] = uSrc2.au64[0];
13319}
13320
13321#endif
13322
13323IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13324{
13325 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13326 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13327 ASMCompilerBarrier();
13328 puDst->au64[0] = uSrc1.au64[0];
13329 puDst->au64[1] = uSrc2.au64[0];
13330}
13331
13332
13333IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13334{
13335 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13336 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13337 ASMCompilerBarrier();
13338 puDst->au64[0] = uSrc1.au64[0];
13339 puDst->au64[1] = uSrc2.au64[0];
13340 puDst->au64[2] = uSrc1.au64[2];
13341 puDst->au64[3] = uSrc2.au64[2];
13342}
13343
13344
13345/*
13346 * UNPCKHPS / VUNPCKHPS
13347 */
13348#ifdef IEM_WITHOUT_ASSEMBLY
13349IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13350{
13351 RTUINT128U uSrc1 = *puDst;
13352 RTUINT128U uSrc2 = *puSrc;
13353 ASMCompilerBarrier();
13354 puDst->au32[0] = uSrc1.au32[2];
13355 puDst->au32[1] = uSrc2.au32[2];
13356 puDst->au32[2] = uSrc1.au32[3];
13357 puDst->au32[3] = uSrc2.au32[3];
13358}
13359
13360#endif
13361
13362IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13363{
13364 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13365 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13366 ASMCompilerBarrier();
13367 puDst->au32[0] = uSrc1.au32[2];
13368 puDst->au32[1] = uSrc2.au32[2];
13369 puDst->au32[2] = uSrc1.au32[3];
13370 puDst->au32[3] = uSrc2.au32[3];
13371}
13372
13373
13374IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13375{
13376 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13377 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13378 ASMCompilerBarrier();
13379 puDst->au32[0] = uSrc1.au32[2];
13380 puDst->au32[1] = uSrc2.au32[2];
13381 puDst->au32[2] = uSrc1.au32[3];
13382 puDst->au32[3] = uSrc2.au32[3];
13383
13384 puDst->au32[4] = uSrc1.au32[6];
13385 puDst->au32[5] = uSrc2.au32[6];
13386 puDst->au32[6] = uSrc1.au32[7];
13387 puDst->au32[7] = uSrc2.au32[7];
13388}
13389
13390
13391/*
13392 * UNPCKHPD / VUNPCKHPD
13393 */
13394#ifdef IEM_WITHOUT_ASSEMBLY
13395IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13396{
13397 RTUINT128U uSrc1 = *puDst;
13398 RTUINT128U uSrc2 = *puSrc;
13399 ASMCompilerBarrier();
13400 puDst->au64[0] = uSrc1.au64[1];
13401 puDst->au64[1] = uSrc2.au64[1];
13402}
13403
13404#endif
13405
13406IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13407{
13408 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13409 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13410 ASMCompilerBarrier();
13411 puDst->au64[0] = uSrc1.au64[1];
13412 puDst->au64[1] = uSrc2.au64[1];
13413}
13414
13415
13416IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13417{
13418 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13419 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13420 ASMCompilerBarrier();
13421 puDst->au64[0] = uSrc1.au64[1];
13422 puDst->au64[1] = uSrc2.au64[1];
13423 puDst->au64[2] = uSrc1.au64[3];
13424 puDst->au64[3] = uSrc2.au64[3];
13425}
13426
13427
13428/*
 * CRC32 (SSE 4.2).
13430 */
13431
13432IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
13433{
13434 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13435}
13436
13437
13438IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
13439{
13440 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13441}
13442
13443IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
13444{
13445 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13446}
13447
13448IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
13449{
13450 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13451}
13452
13453
13454/*
 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
13456 */
13457#ifdef IEM_WITHOUT_ASSEMBLY
13458IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
13459{
13460 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
13461 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
13462 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
13463 fEfl |= X86_EFL_ZF;
13464 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
13465 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
13466 fEfl |= X86_EFL_CF;
13467 *pfEFlags = fEfl;
13468}
13469#endif
13470
13471IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
13472{
13473 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
13474 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
13475 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
13476 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
13477 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
13478 fEfl |= X86_EFL_ZF;
13479 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
13480 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
13481 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
13482 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
13483 fEfl |= X86_EFL_CF;
13484 *pfEFlags = fEfl;
13485}
13486
13487
13488/*
13489 * PMOVSXBW / VPMOVSXBW
13490 */
13491IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
13492{
13493 RTUINT64U uSrc1 = { uSrc };
13494 puDst->ai16[0] = uSrc1.ai8[0];
13495 puDst->ai16[1] = uSrc1.ai8[1];
13496 puDst->ai16[2] = uSrc1.ai8[2];
13497 puDst->ai16[3] = uSrc1.ai8[3];
13498 puDst->ai16[4] = uSrc1.ai8[4];
13499 puDst->ai16[5] = uSrc1.ai8[5];
13500 puDst->ai16[6] = uSrc1.ai8[6];
13501 puDst->ai16[7] = uSrc1.ai8[7];
13502}
13503
13504
13505IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13506{
13507 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13508 puDst->ai16[ 0] = uSrc1.ai8[ 0];
13509 puDst->ai16[ 1] = uSrc1.ai8[ 1];
13510 puDst->ai16[ 2] = uSrc1.ai8[ 2];
13511 puDst->ai16[ 3] = uSrc1.ai8[ 3];
13512 puDst->ai16[ 4] = uSrc1.ai8[ 4];
13513 puDst->ai16[ 5] = uSrc1.ai8[ 5];
13514 puDst->ai16[ 6] = uSrc1.ai8[ 6];
13515 puDst->ai16[ 7] = uSrc1.ai8[ 7];
13516 puDst->ai16[ 8] = uSrc1.ai8[ 8];
13517 puDst->ai16[ 9] = uSrc1.ai8[ 9];
13518 puDst->ai16[10] = uSrc1.ai8[10];
13519 puDst->ai16[11] = uSrc1.ai8[11];
13520 puDst->ai16[12] = uSrc1.ai8[12];
13521 puDst->ai16[13] = uSrc1.ai8[13];
13522 puDst->ai16[14] = uSrc1.ai8[14];
13523 puDst->ai16[15] = uSrc1.ai8[15];
13524}
13525
13526
13527/*
13528 * PMOVSXBD / VPMOVSXBD
13529 */
13530IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
13531{
13532 RTUINT32U uSrc1 = { uSrc };
13533 puDst->ai32[0] = uSrc1.ai8[0];
13534 puDst->ai32[1] = uSrc1.ai8[1];
13535 puDst->ai32[2] = uSrc1.ai8[2];
13536 puDst->ai32[3] = uSrc1.ai8[3];
13537}
13538
13539
13540IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13541{
13542 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13543 puDst->ai32[0] = uSrc1.ai8[0];
13544 puDst->ai32[1] = uSrc1.ai8[1];
13545 puDst->ai32[2] = uSrc1.ai8[2];
13546 puDst->ai32[3] = uSrc1.ai8[3];
13547 puDst->ai32[4] = uSrc1.ai8[4];
13548 puDst->ai32[5] = uSrc1.ai8[5];
13549 puDst->ai32[6] = uSrc1.ai8[6];
13550 puDst->ai32[7] = uSrc1.ai8[7];
13551}
13552
13553
13554/*
13555 * PMOVSXBQ / VPMOVSXBQ
13556 */
13557IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
13558{
13559 RTUINT16U uSrc1 = { uSrc };
13560 puDst->ai64[0] = uSrc1.ai8[0];
13561 puDst->ai64[1] = uSrc1.ai8[1];
13562}
13563
13564
13565IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13566{
13567 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13568 puDst->ai64[0] = uSrc1.ai8[0];
13569 puDst->ai64[1] = uSrc1.ai8[1];
13570 puDst->ai64[2] = uSrc1.ai8[2];
13571 puDst->ai64[3] = uSrc1.ai8[3];
13572}
13573
13574
13575/*
13576 * PMOVSXWD / VPMOVSXWD
13577 */
13578IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
13579{
13580 RTUINT64U uSrc1 = { uSrc };
13581 puDst->ai32[0] = uSrc1.ai16[0];
13582 puDst->ai32[1] = uSrc1.ai16[1];
13583 puDst->ai32[2] = uSrc1.ai16[2];
13584 puDst->ai32[3] = uSrc1.ai16[3];
13585}
13586
13587
13588IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13589{
13590 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13591 puDst->ai32[0] = uSrc1.ai16[0];
13592 puDst->ai32[1] = uSrc1.ai16[1];
13593 puDst->ai32[2] = uSrc1.ai16[2];
13594 puDst->ai32[3] = uSrc1.ai16[3];
13595 puDst->ai32[4] = uSrc1.ai16[4];
13596 puDst->ai32[5] = uSrc1.ai16[5];
13597 puDst->ai32[6] = uSrc1.ai16[6];
13598 puDst->ai32[7] = uSrc1.ai16[7];
13599}
13600
13601
13602/*
13603 * PMOVSXWQ / VPMOVSXWQ
13604 */
13605IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
13606{
13607 RTUINT32U uSrc1 = { uSrc };
13608 puDst->ai64[0] = uSrc1.ai16[0];
13609 puDst->ai64[1] = uSrc1.ai16[1];
13610}
13611
13612
13613IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13614{
13615 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13616 puDst->ai64[0] = uSrc1.ai16[0];
13617 puDst->ai64[1] = uSrc1.ai16[1];
13618 puDst->ai64[2] = uSrc1.ai16[2];
13619 puDst->ai64[3] = uSrc1.ai16[3];
13620}
13621
13622
13623/*
13624 * PMOVSXDQ / VPMOVSXDQ
13625 */
13626IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
13627{
13628 RTUINT64U uSrc1 = { uSrc };
13629 puDst->ai64[0] = uSrc1.ai32[0];
13630 puDst->ai64[1] = uSrc1.ai32[1];
13631}
13632
13633
13634IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13635{
13636 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13637 puDst->ai64[0] = uSrc1.ai32[0];
13638 puDst->ai64[1] = uSrc1.ai32[1];
13639 puDst->ai64[2] = uSrc1.ai32[2];
13640 puDst->ai64[3] = uSrc1.ai32[3];
13641}
13642
13643
13644/*
13645 * PMOVZXBW / VPMOVZXBW
13646 */
13647IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
13648{
13649 RTUINT64U uSrc1 = { uSrc };
13650 puDst->au16[0] = uSrc1.au8[0];
13651 puDst->au16[1] = uSrc1.au8[1];
13652 puDst->au16[2] = uSrc1.au8[2];
13653 puDst->au16[3] = uSrc1.au8[3];
13654 puDst->au16[4] = uSrc1.au8[4];
13655 puDst->au16[5] = uSrc1.au8[5];
13656 puDst->au16[6] = uSrc1.au8[6];
13657 puDst->au16[7] = uSrc1.au8[7];
13658}
13659
13660
13661IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13662{
13663 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13664 puDst->au16[ 0] = uSrc1.au8[ 0];
13665 puDst->au16[ 1] = uSrc1.au8[ 1];
13666 puDst->au16[ 2] = uSrc1.au8[ 2];
13667 puDst->au16[ 3] = uSrc1.au8[ 3];
13668 puDst->au16[ 4] = uSrc1.au8[ 4];
13669 puDst->au16[ 5] = uSrc1.au8[ 5];
13670 puDst->au16[ 6] = uSrc1.au8[ 6];
13671 puDst->au16[ 7] = uSrc1.au8[ 7];
13672 puDst->au16[ 8] = uSrc1.au8[ 8];
13673 puDst->au16[ 9] = uSrc1.au8[ 9];
13674 puDst->au16[10] = uSrc1.au8[10];
13675 puDst->au16[11] = uSrc1.au8[11];
13676 puDst->au16[12] = uSrc1.au8[12];
13677 puDst->au16[13] = uSrc1.au8[13];
13678 puDst->au16[14] = uSrc1.au8[14];
13679 puDst->au16[15] = uSrc1.au8[15];
13680}
13681
13682
13683/*
13684 * PMOVZXBD / VPMOVZXBD
13685 */
13686IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
13687{
13688 RTUINT32U uSrc1 = { uSrc };
13689 puDst->au32[0] = uSrc1.au8[0];
13690 puDst->au32[1] = uSrc1.au8[1];
13691 puDst->au32[2] = uSrc1.au8[2];
13692 puDst->au32[3] = uSrc1.au8[3];
13693}
13694
13695
13696IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13697{
13698 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13699 puDst->au32[0] = uSrc1.au8[0];
13700 puDst->au32[1] = uSrc1.au8[1];
13701 puDst->au32[2] = uSrc1.au8[2];
13702 puDst->au32[3] = uSrc1.au8[3];
13703 puDst->au32[4] = uSrc1.au8[4];
13704 puDst->au32[5] = uSrc1.au8[5];
13705 puDst->au32[6] = uSrc1.au8[6];
13706 puDst->au32[7] = uSrc1.au8[7];
13707}
13708
13709
13710/*
13711 * PMOVZXBQ / VPMOVZXBQ
13712 */
13713IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
13714{
13715 RTUINT16U uSrc1 = { uSrc };
13716 puDst->au64[0] = uSrc1.au8[0];
13717 puDst->au64[1] = uSrc1.au8[1];
13718}
13719
13720
13721IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13722{
13723 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13724 puDst->au64[0] = uSrc1.au8[0];
13725 puDst->au64[1] = uSrc1.au8[1];
13726 puDst->au64[2] = uSrc1.au8[2];
13727 puDst->au64[3] = uSrc1.au8[3];
13728}
13729
13730
13731/*
13732 * PMOVZXWD / VPMOVZXWD
13733 */
13734IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
13735{
13736 RTUINT64U uSrc1 = { uSrc };
13737 puDst->au32[0] = uSrc1.au16[0];
13738 puDst->au32[1] = uSrc1.au16[1];
13739 puDst->au32[2] = uSrc1.au16[2];
13740 puDst->au32[3] = uSrc1.au16[3];
13741}
13742
13743
13744IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13745{
13746 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13747 puDst->au32[0] = uSrc1.au16[0];
13748 puDst->au32[1] = uSrc1.au16[1];
13749 puDst->au32[2] = uSrc1.au16[2];
13750 puDst->au32[3] = uSrc1.au16[3];
13751 puDst->au32[4] = uSrc1.au16[4];
13752 puDst->au32[5] = uSrc1.au16[5];
13753 puDst->au32[6] = uSrc1.au16[6];
13754 puDst->au32[7] = uSrc1.au16[7];
13755}
13756
13757
13758/*
13759 * PMOVZXWQ / VPMOVZXWQ
13760 */
13761IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
13762{
13763 RTUINT32U uSrc1 = { uSrc };
13764 puDst->au64[0] = uSrc1.au16[0];
13765 puDst->au64[1] = uSrc1.au16[1];
13766}
13767
13768
13769IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13770{
13771 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13772 puDst->au64[0] = uSrc1.au16[0];
13773 puDst->au64[1] = uSrc1.au16[1];
13774 puDst->au64[2] = uSrc1.au16[2];
13775 puDst->au64[3] = uSrc1.au16[3];
13776}
13777
13778
13779/*
13780 * PMOVZXDQ / VPMOVZXDQ
13781 */
13782IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
13783{
13784 RTUINT64U uSrc1 = { uSrc };
13785 puDst->au64[0] = uSrc1.au32[0];
13786 puDst->au64[1] = uSrc1.au32[1];
13787}
13788
13789
13790IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13791{
13792 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13793 puDst->au64[0] = uSrc1.au32[0];
13794 puDst->au64[1] = uSrc1.au32[1];
13795 puDst->au64[2] = uSrc1.au32[2];
13796 puDst->au64[3] = uSrc1.au32[3];
13797}
13798
13799
13800#ifdef IEM_WITHOUT_ASSEMBLY
13801/**
13802 * Converts from the packed IPRT 32-bit (single precision) floating point format to
13803 * the SoftFloat 32-bit floating point format (float32_t).
13804 *
13805 * This is only a structure format conversion, nothing else.
13806 */
13807DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
13808{
13809 float32_t Tmp;
13810 Tmp.v = pr32Val->u;
13811 return Tmp;
13812}
13813
13814
13815/**
13816 * Converts from SoftFloat 32-bit floating point format (float32_t)
13817 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
13818 *
13819 * This is only a structure format conversion, nothing else.
13820 */
13821DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
13822{
13823 pr32Dst->u = r32XSrc.v;
13824 return pr32Dst;
13825}
13826
13827
13828/**
13829 * Converts from the packed IPRT 64-bit (single precision) floating point format to
13830 * the SoftFloat 64-bit floating point format (float64_t).
13831 *
13832 * This is only a structure format conversion, nothing else.
13833 */
13834DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
13835{
13836 float64_t Tmp;
13837 Tmp.v = pr64Val->u;
13838 return Tmp;
13839}
13840
13841
13842/**
13843 * Converts from SoftFloat 64-bit floating point format (float64_t)
13844 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
13845 *
13846 * This is only a structure format conversion, nothing else.
13847 */
13848DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
13849{
13850 pr64Dst->u = r64XSrc.v;
13851 return pr64Dst;
13852}
13853
13854
/**
 * Initializer for the SoftFloat state structure, derived from an MXCSR value.
 *
 * The rounding mode is picked from the MXCSR.RC field (nearest -> near_even,
 * up -> max, down -> min, anything else -> minMag) and the masked exception
 * bits are shifted down from X86_MXCSR_XCPT_MASK.  The third member is
 * initialized to zero (presumably the accumulated exception flags - see the
 * softfloat_state_t declaration).
 *
 * @param   a_Mxcsr     The MXCSR value to derive the state from.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        softfloat_tininess_afterRounding, \
          ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? (uint8_t)softfloat_round_min \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        :                                                           (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), /* Matches X86_FSW_?E */\
        32 /* Rounding precision, not relevant for SIMD. */ \
    }
13867
13868
13869/**
13870 * Helper for transfering exception to MXCSR and setting the result value
13871 * accordingly.
13872 *
13873 * @returns Updated MXCSR.
13874 * @param pSoftState The SoftFloat state following the operation.
13875 * @param r32Result The result of the SoftFloat operation.
13876 * @param pr32Result Where to store the result for IEM.
13877 * @param fMxcsr The original MXCSR value.
13878 */
13879DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
13880 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
13881{
13882 iemFpSoftF32ToIprt(pr32Result, r32Result);
13883
13884 uint8_t fXcpt = pSoftState->exceptionFlags;
13885 if ( (fMxcsr & X86_MXCSR_FZ)
13886 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
13887 {
13888 /* Underflow masked and flush to zero is set. */
13889 pr32Result->s.uFraction = 0;
13890 pr32Result->s.uExponent = 0;
13891 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
13892 }
13893
13894 /* If DAZ is set \#DE is never set. */
13895 if ( fMxcsr & X86_MXCSR_DAZ
13896 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
13897 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
13898 fXcpt &= ~X86_MXCSR_DE;
13899
13900 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
13901}
13902
13903
13904/**
13905 * Helper for transfering exception to MXCSR and setting the result value
13906 * accordingly - ignores Flush-to-Zero.
13907 *
13908 * @returns Updated MXCSR.
13909 * @param pSoftState The SoftFloat state following the operation.
13910 * @param r32Result The result of the SoftFloat operation.
13911 * @param pr32Result Where to store the result for IEM.
13912 * @param fMxcsr The original MXCSR value.
13913 */
13914DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
13915 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
13916{
13917 iemFpSoftF32ToIprt(pr32Result, r32Result);
13918
13919 uint8_t fXcpt = pSoftState->exceptionFlags;
13920 /* If DAZ is set \#DE is never set. */
13921 if ( fMxcsr & X86_MXCSR_DAZ
13922 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
13923 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
13924 fXcpt &= ~X86_MXCSR_DE;
13925
13926 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
13927}
13928
13929
13930/**
13931 * Helper for transfering exception to MXCSR and setting the result value
13932 * accordingly.
13933 *
13934 * @returns Updated MXCSR.
13935 * @param pSoftState The SoftFloat state following the operation.
13936 * @param r64Result The result of the SoftFloat operation.
13937 * @param pr64Result Where to store the result for IEM.
13938 * @param fMxcsr The original MXCSR value.
13939 */
13940DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
13941 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
13942{
13943 iemFpSoftF64ToIprt(pr64Result, r64Result);
13944 uint8_t fXcpt = pSoftState->exceptionFlags;
13945 if ( (fMxcsr & X86_MXCSR_FZ)
13946 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
13947 {
13948 /* Underflow masked and flush to zero is set. */
13949 iemFpSoftF64ToIprt(pr64Result, r64Result);
13950 pr64Result->s.uFractionHigh = 0;
13951 pr64Result->s.uFractionLow = 0;
13952 pr64Result->s.uExponent = 0;
13953 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
13954 }
13955
13956 /* If DAZ is set \#DE is never set. */
13957 if ( fMxcsr & X86_MXCSR_DAZ
13958 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
13959 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
13960 fXcpt &= ~X86_MXCSR_DE;
13961
13962 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
13963}
13964
13965
13966/**
13967 * Helper for transfering exception to MXCSR and setting the result value
13968 * accordingly - ignores Flush-to-Zero.
13969 *
13970 * @returns Updated MXCSR.
13971 * @param pSoftState The SoftFloat state following the operation.
13972 * @param r64Result The result of the SoftFloat operation.
13973 * @param pr64Result Where to store the result for IEM.
13974 * @param fMxcsr The original MXCSR value.
13975 */
13976DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
13977 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
13978{
13979 iemFpSoftF64ToIprt(pr64Result, r64Result);
13980
13981 uint8_t fXcpt = pSoftState->exceptionFlags;
13982 /* If DAZ is set \#DE is never set. */
13983 if ( fMxcsr & X86_MXCSR_DAZ
13984 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
13985 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
13986 fXcpt &= ~X86_MXCSR_DE;
13987
13988 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
13989}
13990
13991
13992/**
13993 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
13994 * in MXCSR into account.
13995 *
13996 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
13997 * @param pr32Val Where to store the result.
13998 * @param fMxcsr The input MXCSR value.
13999 * @param pr32Src The value to use.
14000 */
14001DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
14002{
14003 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
14004 {
14005 if (fMxcsr & X86_MXCSR_DAZ)
14006 {
14007 /* De-normals are changed to 0. */
14008 pr32Val->s.fSign = pr32Src->s.fSign;
14009 pr32Val->s.uFraction = 0;
14010 pr32Val->s.uExponent = 0;
14011 return 0;
14012 }
14013
14014 *pr32Val = *pr32Src;
14015 return X86_MXCSR_DE;
14016 }
14017
14018 *pr32Val = *pr32Src;
14019 return 0;
14020}
14021
14022
14023/**
14024 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
14025 * in MXCSR into account.
14026 *
14027 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14028 * @param pr64Val Where to store the result.
14029 * @param fMxcsr The input MXCSR value.
14030 * @param pr64Src The value to use.
14031 */
14032DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
14033{
14034 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
14035 {
14036 if (fMxcsr & X86_MXCSR_DAZ)
14037 {
14038 /* De-normals are changed to 0. */
14039 pr64Val->s64.fSign = pr64Src->s.fSign;
14040 pr64Val->s64.uFraction = 0;
14041 pr64Val->s64.uExponent = 0;
14042 return 0;
14043 }
14044
14045 *pr64Val = *pr64Src;
14046 return X86_MXCSR_DE;
14047 }
14048
14049 *pr64Val = *pr64Src;
14050 return 0;
14051}
14052
14053
14054/**
14055 * Validates the given input operands returning whether the operation can continue or whether one
14056 * of the source operands contains a NaN value, setting the output accordingly.
14057 *
14058 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14059 * @param pr32Res Where to store the result in case the operation can't continue.
14060 * @param pr32Val1 The first input operand.
14061 * @param pr32Val2 The second input operand.
14062 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14063 */
14064DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
14065{
14066 uint8_t cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
14067 uint8_t cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
14068 if (cSNan + cQNan == 2)
14069 {
14070 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14071 *pr32Res = *pr32Val1;
14072 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14073 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14074 return true;
14075 }
14076 else if (cSNan)
14077 {
14078 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14079 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14080 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14081 *pfMxcsr |= X86_MXCSR_IE;
14082 return true;
14083 }
14084 else if (cQNan)
14085 {
14086 /* The QNan operand is placed into the result. */
14087 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14088 return true;
14089 }
14090
14091 Assert(!cQNan && !cSNan);
14092 return false;
14093}
14094
14095
14096/**
14097 * Validates the given double precision input operands returning whether the operation can continue or whether one
14098 * of the source operands contains a NaN value, setting the output accordingly.
14099 *
14100 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14101 * @param pr64Res Where to store the result in case the operation can't continue.
14102 * @param pr64Val1 The first input operand.
14103 * @param pr64Val2 The second input operand.
14104 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14105 */
14106DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
14107{
14108 uint8_t cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
14109 uint8_t cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
14110 if (cSNan + cQNan == 2)
14111 {
14112 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14113 *pr64Res = *pr64Val1;
14114 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14115 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14116 return true;
14117 }
14118 else if (cSNan)
14119 {
14120 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14121 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14122 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14123 *pfMxcsr |= X86_MXCSR_IE;
14124 return true;
14125 }
14126 else if (cQNan)
14127 {
14128 /* The QNan operand is placed into the result. */
14129 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14130 return true;
14131 }
14132
14133 Assert(!cQNan && !cSNan);
14134 return false;
14135}
14136#endif
14137
14138
14139/**
14140 * ADDPS
14141 */
14142#ifdef IEM_WITHOUT_ASSEMBLY
14143static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14144{
14145 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14146 return fMxcsr;
14147
14148 RTFLOAT32U r32Src1, r32Src2;
14149 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14150 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14151 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14152 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14153 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14154}
14155
14156
14157IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14158{
14159 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14160 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14161 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14162 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14163}
14164#endif
14165
14166
14167/**
14168 * ADDSS
14169 */
14170#ifdef IEM_WITHOUT_ASSEMBLY
14171IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14172{
14173 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14174 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14175 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14176 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14177}
14178#endif
14179
14180
14181/**
14182 * ADDPD
14183 */
14184#ifdef IEM_WITHOUT_ASSEMBLY
14185static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14186{
14187 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14188 return fMxcsr;
14189
14190 RTFLOAT64U r64Src1, r64Src2;
14191 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14192 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14193 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14194 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14195 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14196}
14197
14198
14199IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14200{
14201 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14202 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14203}
14204#endif
14205
14206
14207/**
14208 * ADDSD
14209 */
14210#ifdef IEM_WITHOUT_ASSEMBLY
14211IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14212{
14213 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14214 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14215}
14216#endif
14217
14218
14219/**
14220 * MULPS
14221 */
14222#ifdef IEM_WITHOUT_ASSEMBLY
14223static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14224{
14225 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14226 return fMxcsr;
14227
14228 RTFLOAT32U r32Src1, r32Src2;
14229 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14230 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14231 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14232 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14233 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14234}
14235
14236
14237IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14238{
14239 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14240 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14241 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14242 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14243}
14244#endif
14245
14246
14247/**
14248 * MULSS
14249 */
14250#ifdef IEM_WITHOUT_ASSEMBLY
14251IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14252{
14253 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14254 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14255 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14256 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14257}
14258#endif
14259
14260
14261/**
14262 * MULPD
14263 */
14264#ifdef IEM_WITHOUT_ASSEMBLY
14265static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14266{
14267 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14268 return fMxcsr;
14269
14270 RTFLOAT64U r64Src1, r64Src2;
14271 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14272 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14273 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14274 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14275 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14276}
14277
14278
14279IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14280{
14281 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14282 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14283}
14284#endif
14285
14286
14287/**
14288 * MULSD
14289 */
14290#ifdef IEM_WITHOUT_ASSEMBLY
14291IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14292{
14293 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14294 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14295}
14296#endif
14297
14298
14299/**
14300 * SUBPS
14301 */
14302#ifdef IEM_WITHOUT_ASSEMBLY
14303static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14304{
14305 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14306 return fMxcsr;
14307
14308 RTFLOAT32U r32Src1, r32Src2;
14309 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14310 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14311 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14312 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14313 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14314}
14315
14316
14317IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14318{
14319 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14320 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14321 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14322 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14323}
14324#endif
14325
14326
14327/**
14328 * SUBSS
14329 */
14330#ifdef IEM_WITHOUT_ASSEMBLY
14331IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14332{
14333 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14334 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14335 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14336 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14337}
14338#endif
14339
14340
14341/**
14342 * SUBPD
14343 */
14344#ifdef IEM_WITHOUT_ASSEMBLY
14345static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14346{
14347 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14348 return fMxcsr;
14349
14350 RTFLOAT64U r64Src1, r64Src2;
14351 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14352 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14353 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14354 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14355 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14356}
14357
14358
14359IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14360{
14361 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14362 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14363}
14364#endif
14365
14366
14367/**
14368 * SUBSD
14369 */
14370#ifdef IEM_WITHOUT_ASSEMBLY
14371IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14372{
14373 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14374 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14375}
14376#endif
14377
14378
14379/**
14380 * MINPS
14381 */
14382#ifdef IEM_WITHOUT_ASSEMBLY
14383static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14384{
14385 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
14386 {
14387 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
14388 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
14389 return fMxcsr | X86_MXCSR_IE;
14390 }
14391
14392 RTFLOAT32U r32Src1, r32Src2;
14393 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14394 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14395 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
14396 {
14397 *pr32Res = r32Src2;
14398 return fMxcsr;
14399 }
14400
14401 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14402 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14403 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
14404 fLe
14405 ? iemFpSoftF32FromIprt(&r32Src1)
14406 : iemFpSoftF32FromIprt(&r32Src2),
14407 pr32Res, fMxcsr);
14408}
14409
14410
14411IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14412{
14413 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14414 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14415 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14416 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14417}
14418#endif
14419
14420
14421/**
14422 * MINSS
14423 */
14424#ifdef IEM_WITHOUT_ASSEMBLY
14425IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14426{
14427 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14428 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14429 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14430 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14431}
14432#endif
14433
14434
14435/**
14436 * MINPD
14437 */
14438#ifdef IEM_WITHOUT_ASSEMBLY
14439static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14440{
14441 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
14442 {
14443 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
14444 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
14445 return fMxcsr | X86_MXCSR_IE;
14446 }
14447
14448 RTFLOAT64U r64Src1, r64Src2;
14449 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14450 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14451 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
14452 {
14453 *pr64Res = r64Src2;
14454 return fMxcsr;
14455 }
14456
14457 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14458 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14459 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
14460 fLe
14461 ? iemFpSoftF64FromIprt(&r64Src1)
14462 : iemFpSoftF64FromIprt(&r64Src2),
14463 pr64Res, fMxcsr);
14464}
14465
14466
14467IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14468{
14469 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14470 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14471}
14472#endif
14473
14474
14475/**
14476 * MINSD
14477 */
14478#ifdef IEM_WITHOUT_ASSEMBLY
14479IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14480{
14481 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14482 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14483}
14484#endif
14485
14486
14487/**
14488 * DIVPS
14489 */
14490#ifdef IEM_WITHOUT_ASSEMBLY
14491static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14492{
14493 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14494 return fMxcsr;
14495
14496 RTFLOAT32U r32Src1, r32Src2;
14497 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14498 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14499 if (RTFLOAT32U_IS_ZERO(&r32Src2))
14500 {
14501 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
14502 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
14503 {
14504 *pr32Res = g_ar32QNaN[1];
14505 return fMxcsr | X86_MXCSR_IE;
14506 }
14507 else if (RTFLOAT32U_IS_INF(&r32Src1))
14508 {
14509 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
14510 return fMxcsr;
14511 }
14512 else
14513 {
14514 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
14515 return fMxcsr | X86_MXCSR_ZE;
14516 }
14517 }
14518
14519 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14520 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14521 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
14522}
14523
14524
14525IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14526{
14527 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14528 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14529 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14530 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14531}
14532#endif
14533
14534
14535/**
14536 * DIVSS
14537 */
14538#ifdef IEM_WITHOUT_ASSEMBLY
14539IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14540{
14541 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14542 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14543 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14544 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14545}
14546#endif
14547
14548
14549/**
14550 * DIVPD
14551 */
14552#ifdef IEM_WITHOUT_ASSEMBLY
14553static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14554{
14555 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14556 return fMxcsr;
14557
14558 RTFLOAT64U r64Src1, r64Src2;
14559 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14560 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14561 if (RTFLOAT64U_IS_ZERO(&r64Src2))
14562 {
14563 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
14564 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
14565 {
14566 *pr64Res = g_ar64QNaN[1];
14567 return fMxcsr | X86_MXCSR_IE;
14568 }
14569 else if (RTFLOAT64U_IS_INF(&r64Src1))
14570 {
14571 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
14572 return fMxcsr;
14573 }
14574 else
14575 {
14576 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
14577 return fMxcsr | X86_MXCSR_ZE;
14578 }
14579 }
14580
14581 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14582 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14583 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
14584}
14585
14586
14587IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14588{
14589 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14590 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14591}
14592#endif
14593
14594
14595/**
14596 * DIVSD
14597 */
14598#ifdef IEM_WITHOUT_ASSEMBLY
14599IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14600{
14601 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14602 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14603}
14604#endif
14605
14606
14607/**
14608 * MAXPS
14609 */
14610#ifdef IEM_WITHOUT_ASSEMBLY
14611static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14612{
14613 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
14614 {
14615 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
14616 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
14617 return fMxcsr | X86_MXCSR_IE;
14618 }
14619
14620 RTFLOAT32U r32Src1, r32Src2;
14621 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14622 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14623 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
14624 {
14625 *pr32Res = r32Src2;
14626 return fMxcsr;
14627 }
14628
14629 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14630 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14631 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
14632 fLe
14633 ? iemFpSoftF32FromIprt(&r32Src2)
14634 : iemFpSoftF32FromIprt(&r32Src1),
14635 pr32Res, fMxcsr);
14636}
14637
14638
14639IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14640{
14641 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14642 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14643 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14644 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14645}
14646#endif
14647
14648
14649/**
14650 * MAXSS
14651 */
14652#ifdef IEM_WITHOUT_ASSEMBLY
14653IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14654{
14655 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14656 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14657 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14658 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14659}
14660#endif
14661
14662
14663/**
14664 * MAXPD
14665 */
14666#ifdef IEM_WITHOUT_ASSEMBLY
14667static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14668{
14669 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
14670 {
14671 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
14672 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
14673 return fMxcsr | X86_MXCSR_IE;
14674 }
14675
14676 RTFLOAT64U r64Src1, r64Src2;
14677 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14678 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14679 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
14680 {
14681 *pr64Res = r64Src2;
14682 return fMxcsr;
14683 }
14684
14685 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14686 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14687 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
14688 fLe
14689 ? iemFpSoftF64FromIprt(&r64Src2)
14690 : iemFpSoftF64FromIprt(&r64Src1),
14691 pr64Res, fMxcsr);
14692}
14693
14694
14695IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14696{
14697 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14698 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14699}
14700#endif
14701
14702
14703/**
14704 * MAXSD
14705 */
14706#ifdef IEM_WITHOUT_ASSEMBLY
14707IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14708{
14709 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14710 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14711}
14712#endif
14713
14714
14715/**
14716 * CVTSS2SD
14717 */
14718#ifdef IEM_WITHOUT_ASSEMBLY
14719static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
14720{
14721 RTFLOAT32U r32Src1;
14722 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14723
14724 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14725 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
14726 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14727}
14728
14729
14730IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14731{
14732 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
14733 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14734}
14735#endif
14736
14737
14738/**
14739 * CVTSD2SS
14740 */
14741#ifdef IEM_WITHOUT_ASSEMBLY
14742static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
14743{
14744 RTFLOAT64U r64Src1;
14745 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14746
14747 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14748 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
14749 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14750}
14751
14752
14753IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14754{
14755 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
14756 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14757 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14758 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14759}
14760#endif
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette