VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 102917

Last change on this file since 102917 was 102896, checked in by vboxsync, 14 months ago

VMM/IEM: Use standard binary assembly helper signature for ADCX and ADOX. Added them to tstIEMAImpl. bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 712.3 KB
Line 
1/* $Id: IEMAllAImplC.cpp 102896 2024-01-16 12:23:05Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless the build already defines it, it is defined here whenever
 * RT_ARCH_ARM32, RT_ARCH_ARM64 or DOXYGEN_RUNNING is defined.
 */
#if    !defined(IEM_WITHOUT_ASSEMBLY) \
    && (defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING))
# define IEM_WITHOUT_ASSEMBLY
#endif

/* For tstIEMAImplAsm purposes IEM_WITH_ASSEMBLY takes precedence over
   IEM_WITHOUT_ASSEMBLY, so the latter is dropped again when both are set. */
#if defined(IEM_WITH_ASSEMBLY)
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Calculates the sign flag value given a result and its bit width.
 *
 * The sign flag (SF) is a copy of the most significant bit of the result, so
 * that bit is isolated first and then moved to the SF position.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( (uint32_t)(((a_uResult) >> ((a_cBitsWidth) - 1)) & 1) << X86_EFL_SF_BIT )
73
/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) is set when the result is zero and clear otherwise.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)!(a_uResult) << X86_EFL_ZF_BIT )
84
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * The calculation result carries the overflow indicator in its most
 * significant bit (bit 7, 15, 31 or 63 depending on the width).  Each variant
 * isolates that bit and moves it to the OF position in EFLAGS.
 *
 * These are typically used by concatenating the name with a bit count.
 *
 * @returns X86_EFL_OF or 0.
 * @param   a_uValue        The OF calculation result.
 */
#define X86_EFL_GET_OF_8(a_uValue)   ( (uint32_t)(((a_uValue) >>  7) & 1) << X86_EFL_OF_BIT )
#define X86_EFL_GET_OF_16(a_uValue)  ( (uint32_t)(((a_uValue) >> 15) & 1) << X86_EFL_OF_BIT )
#define X86_EFL_GET_OF_32(a_uValue)  ( (uint32_t)(((a_uValue) >> 31) & 1) << X86_EFL_OF_BIT )
#define X86_EFL_GET_OF_64(a_uValue)  ( (uint32_t)(((a_uValue) >> 63) & 1) << X86_EFL_OF_BIT )
95
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after an arithmetic op.
 *
 * This is a statement-like macro; it does not yield a value.  All status bits
 * in @a *a_pfEFlags are cleared and then recomputed, the remaining bits are
 * left as they were.  Every argument is evaluated more than once, so pass
 * side-effect free expressions only.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be a
 *                          plain literal as it is pasted into identifiers.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        /* Start from the caller's flags with all status bits cleared. */ \
        uint32_t fEflTmp = *(a_pfEFlags) & ~X86_EFL_STATUS_BITS; \
        \
        /* CF comes from the caller, PF from the low result byte, and AF is the \
           carry/borrow into bit 4 which result ^ src ^ dst exposes. */ \
        fEflTmp |= ((a_CfExpr) << X86_EFL_CF_BIT) \
                |  g_afParity[(a_uResult) & 0xff] \
                |  (((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF); \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult) \
                |  X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* Overflow during ADDition happens when both inputs have the same sign \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition (2 - 1 == 2 + -1), \
           SUBtraction overflows when the sign bits of the two inputs differ \
           and the sign bit of the result differs from that of the first input. \
           Callers handle this by passing the source with its sign bit xor'ed \
           as a_uSrcOf.  Note! Must xor with sign bit to convert, not do \
           (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                      & RT_BIT_64(a_cBitsWidth - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
132
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand
 * is undefined.  We do not set AF, as that seems to make the most sense (which
 * probably makes it the most wrong in real life).
 *
 * This is a statement-like macro; it does not yield a value.  All status bits
 * are cleared first, then PF, ZF and SF are recomputed from the result and
 * @a a_fExtra is or'ed in on top.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags) & ~X86_EFL_STATUS_BITS; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff] \
                |  X86_EFL_CALC_ZF(a_uResult) \
                |  X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
                |  (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
161/**
162 * Parity calculation table.
163 *
164 * This is also used by iemAllAImpl.asm.
165 *
166 * The generator code:
167 * @code
168 * #include <stdio.h>
169 *
170 * int main()
171 * {
172 * unsigned b;
173 * for (b = 0; b < 256; b++)
174 * {
175 * int cOnes = ( b & 1)
176 * + ((b >> 1) & 1)
177 * + ((b >> 2) & 1)
178 * + ((b >> 3) & 1)
179 * + ((b >> 4) & 1)
180 * + ((b >> 5) & 1)
181 * + ((b >> 6) & 1)
182 * + ((b >> 7) & 1);
183 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
184 * b,
185 * (b >> 7) & 1,
186 * (b >> 6) & 1,
187 * (b >> 5) & 1,
188 * (b >> 4) & 1,
189 * (b >> 3) & 1,
190 * (b >> 2) & 1,
191 * (b >> 1) & 1,
192 * b & 1,
193 * cOnes & 1 ? "0" : "X86_EFL_PF");
194 * }
195 * return 0;
196 * }
197 * @endcode
198 */
199uint8_t const g_afParity[256] =
200{
201 /* 0000 = 00000000b */ X86_EFL_PF,
202 /* 0x01 = 00000001b */ 0,
203 /* 0x02 = 00000010b */ 0,
204 /* 0x03 = 00000011b */ X86_EFL_PF,
205 /* 0x04 = 00000100b */ 0,
206 /* 0x05 = 00000101b */ X86_EFL_PF,
207 /* 0x06 = 00000110b */ X86_EFL_PF,
208 /* 0x07 = 00000111b */ 0,
209 /* 0x08 = 00001000b */ 0,
210 /* 0x09 = 00001001b */ X86_EFL_PF,
211 /* 0x0a = 00001010b */ X86_EFL_PF,
212 /* 0x0b = 00001011b */ 0,
213 /* 0x0c = 00001100b */ X86_EFL_PF,
214 /* 0x0d = 00001101b */ 0,
215 /* 0x0e = 00001110b */ 0,
216 /* 0x0f = 00001111b */ X86_EFL_PF,
217 /* 0x10 = 00010000b */ 0,
218 /* 0x11 = 00010001b */ X86_EFL_PF,
219 /* 0x12 = 00010010b */ X86_EFL_PF,
220 /* 0x13 = 00010011b */ 0,
221 /* 0x14 = 00010100b */ X86_EFL_PF,
222 /* 0x15 = 00010101b */ 0,
223 /* 0x16 = 00010110b */ 0,
224 /* 0x17 = 00010111b */ X86_EFL_PF,
225 /* 0x18 = 00011000b */ X86_EFL_PF,
226 /* 0x19 = 00011001b */ 0,
227 /* 0x1a = 00011010b */ 0,
228 /* 0x1b = 00011011b */ X86_EFL_PF,
229 /* 0x1c = 00011100b */ 0,
230 /* 0x1d = 00011101b */ X86_EFL_PF,
231 /* 0x1e = 00011110b */ X86_EFL_PF,
232 /* 0x1f = 00011111b */ 0,
233 /* 0x20 = 00100000b */ 0,
234 /* 0x21 = 00100001b */ X86_EFL_PF,
235 /* 0x22 = 00100010b */ X86_EFL_PF,
236 /* 0x23 = 00100011b */ 0,
237 /* 0x24 = 00100100b */ X86_EFL_PF,
238 /* 0x25 = 00100101b */ 0,
239 /* 0x26 = 00100110b */ 0,
240 /* 0x27 = 00100111b */ X86_EFL_PF,
241 /* 0x28 = 00101000b */ X86_EFL_PF,
242 /* 0x29 = 00101001b */ 0,
243 /* 0x2a = 00101010b */ 0,
244 /* 0x2b = 00101011b */ X86_EFL_PF,
245 /* 0x2c = 00101100b */ 0,
246 /* 0x2d = 00101101b */ X86_EFL_PF,
247 /* 0x2e = 00101110b */ X86_EFL_PF,
248 /* 0x2f = 00101111b */ 0,
249 /* 0x30 = 00110000b */ X86_EFL_PF,
250 /* 0x31 = 00110001b */ 0,
251 /* 0x32 = 00110010b */ 0,
252 /* 0x33 = 00110011b */ X86_EFL_PF,
253 /* 0x34 = 00110100b */ 0,
254 /* 0x35 = 00110101b */ X86_EFL_PF,
255 /* 0x36 = 00110110b */ X86_EFL_PF,
256 /* 0x37 = 00110111b */ 0,
257 /* 0x38 = 00111000b */ 0,
258 /* 0x39 = 00111001b */ X86_EFL_PF,
259 /* 0x3a = 00111010b */ X86_EFL_PF,
260 /* 0x3b = 00111011b */ 0,
261 /* 0x3c = 00111100b */ X86_EFL_PF,
262 /* 0x3d = 00111101b */ 0,
263 /* 0x3e = 00111110b */ 0,
264 /* 0x3f = 00111111b */ X86_EFL_PF,
265 /* 0x40 = 01000000b */ 0,
266 /* 0x41 = 01000001b */ X86_EFL_PF,
267 /* 0x42 = 01000010b */ X86_EFL_PF,
268 /* 0x43 = 01000011b */ 0,
269 /* 0x44 = 01000100b */ X86_EFL_PF,
270 /* 0x45 = 01000101b */ 0,
271 /* 0x46 = 01000110b */ 0,
272 /* 0x47 = 01000111b */ X86_EFL_PF,
273 /* 0x48 = 01001000b */ X86_EFL_PF,
274 /* 0x49 = 01001001b */ 0,
275 /* 0x4a = 01001010b */ 0,
276 /* 0x4b = 01001011b */ X86_EFL_PF,
277 /* 0x4c = 01001100b */ 0,
278 /* 0x4d = 01001101b */ X86_EFL_PF,
279 /* 0x4e = 01001110b */ X86_EFL_PF,
280 /* 0x4f = 01001111b */ 0,
281 /* 0x50 = 01010000b */ X86_EFL_PF,
282 /* 0x51 = 01010001b */ 0,
283 /* 0x52 = 01010010b */ 0,
284 /* 0x53 = 01010011b */ X86_EFL_PF,
285 /* 0x54 = 01010100b */ 0,
286 /* 0x55 = 01010101b */ X86_EFL_PF,
287 /* 0x56 = 01010110b */ X86_EFL_PF,
288 /* 0x57 = 01010111b */ 0,
289 /* 0x58 = 01011000b */ 0,
290 /* 0x59 = 01011001b */ X86_EFL_PF,
291 /* 0x5a = 01011010b */ X86_EFL_PF,
292 /* 0x5b = 01011011b */ 0,
293 /* 0x5c = 01011100b */ X86_EFL_PF,
294 /* 0x5d = 01011101b */ 0,
295 /* 0x5e = 01011110b */ 0,
296 /* 0x5f = 01011111b */ X86_EFL_PF,
297 /* 0x60 = 01100000b */ X86_EFL_PF,
298 /* 0x61 = 01100001b */ 0,
299 /* 0x62 = 01100010b */ 0,
300 /* 0x63 = 01100011b */ X86_EFL_PF,
301 /* 0x64 = 01100100b */ 0,
302 /* 0x65 = 01100101b */ X86_EFL_PF,
303 /* 0x66 = 01100110b */ X86_EFL_PF,
304 /* 0x67 = 01100111b */ 0,
305 /* 0x68 = 01101000b */ 0,
306 /* 0x69 = 01101001b */ X86_EFL_PF,
307 /* 0x6a = 01101010b */ X86_EFL_PF,
308 /* 0x6b = 01101011b */ 0,
309 /* 0x6c = 01101100b */ X86_EFL_PF,
310 /* 0x6d = 01101101b */ 0,
311 /* 0x6e = 01101110b */ 0,
312 /* 0x6f = 01101111b */ X86_EFL_PF,
313 /* 0x70 = 01110000b */ 0,
314 /* 0x71 = 01110001b */ X86_EFL_PF,
315 /* 0x72 = 01110010b */ X86_EFL_PF,
316 /* 0x73 = 01110011b */ 0,
317 /* 0x74 = 01110100b */ X86_EFL_PF,
318 /* 0x75 = 01110101b */ 0,
319 /* 0x76 = 01110110b */ 0,
320 /* 0x77 = 01110111b */ X86_EFL_PF,
321 /* 0x78 = 01111000b */ X86_EFL_PF,
322 /* 0x79 = 01111001b */ 0,
323 /* 0x7a = 01111010b */ 0,
324 /* 0x7b = 01111011b */ X86_EFL_PF,
325 /* 0x7c = 01111100b */ 0,
326 /* 0x7d = 01111101b */ X86_EFL_PF,
327 /* 0x7e = 01111110b */ X86_EFL_PF,
328 /* 0x7f = 01111111b */ 0,
329 /* 0x80 = 10000000b */ 0,
330 /* 0x81 = 10000001b */ X86_EFL_PF,
331 /* 0x82 = 10000010b */ X86_EFL_PF,
332 /* 0x83 = 10000011b */ 0,
333 /* 0x84 = 10000100b */ X86_EFL_PF,
334 /* 0x85 = 10000101b */ 0,
335 /* 0x86 = 10000110b */ 0,
336 /* 0x87 = 10000111b */ X86_EFL_PF,
337 /* 0x88 = 10001000b */ X86_EFL_PF,
338 /* 0x89 = 10001001b */ 0,
339 /* 0x8a = 10001010b */ 0,
340 /* 0x8b = 10001011b */ X86_EFL_PF,
341 /* 0x8c = 10001100b */ 0,
342 /* 0x8d = 10001101b */ X86_EFL_PF,
343 /* 0x8e = 10001110b */ X86_EFL_PF,
344 /* 0x8f = 10001111b */ 0,
345 /* 0x90 = 10010000b */ X86_EFL_PF,
346 /* 0x91 = 10010001b */ 0,
347 /* 0x92 = 10010010b */ 0,
348 /* 0x93 = 10010011b */ X86_EFL_PF,
349 /* 0x94 = 10010100b */ 0,
350 /* 0x95 = 10010101b */ X86_EFL_PF,
351 /* 0x96 = 10010110b */ X86_EFL_PF,
352 /* 0x97 = 10010111b */ 0,
353 /* 0x98 = 10011000b */ 0,
354 /* 0x99 = 10011001b */ X86_EFL_PF,
355 /* 0x9a = 10011010b */ X86_EFL_PF,
356 /* 0x9b = 10011011b */ 0,
357 /* 0x9c = 10011100b */ X86_EFL_PF,
358 /* 0x9d = 10011101b */ 0,
359 /* 0x9e = 10011110b */ 0,
360 /* 0x9f = 10011111b */ X86_EFL_PF,
361 /* 0xa0 = 10100000b */ X86_EFL_PF,
362 /* 0xa1 = 10100001b */ 0,
363 /* 0xa2 = 10100010b */ 0,
364 /* 0xa3 = 10100011b */ X86_EFL_PF,
365 /* 0xa4 = 10100100b */ 0,
366 /* 0xa5 = 10100101b */ X86_EFL_PF,
367 /* 0xa6 = 10100110b */ X86_EFL_PF,
368 /* 0xa7 = 10100111b */ 0,
369 /* 0xa8 = 10101000b */ 0,
370 /* 0xa9 = 10101001b */ X86_EFL_PF,
371 /* 0xaa = 10101010b */ X86_EFL_PF,
372 /* 0xab = 10101011b */ 0,
373 /* 0xac = 10101100b */ X86_EFL_PF,
374 /* 0xad = 10101101b */ 0,
375 /* 0xae = 10101110b */ 0,
376 /* 0xaf = 10101111b */ X86_EFL_PF,
377 /* 0xb0 = 10110000b */ 0,
378 /* 0xb1 = 10110001b */ X86_EFL_PF,
379 /* 0xb2 = 10110010b */ X86_EFL_PF,
380 /* 0xb3 = 10110011b */ 0,
381 /* 0xb4 = 10110100b */ X86_EFL_PF,
382 /* 0xb5 = 10110101b */ 0,
383 /* 0xb6 = 10110110b */ 0,
384 /* 0xb7 = 10110111b */ X86_EFL_PF,
385 /* 0xb8 = 10111000b */ X86_EFL_PF,
386 /* 0xb9 = 10111001b */ 0,
387 /* 0xba = 10111010b */ 0,
388 /* 0xbb = 10111011b */ X86_EFL_PF,
389 /* 0xbc = 10111100b */ 0,
390 /* 0xbd = 10111101b */ X86_EFL_PF,
391 /* 0xbe = 10111110b */ X86_EFL_PF,
392 /* 0xbf = 10111111b */ 0,
393 /* 0xc0 = 11000000b */ X86_EFL_PF,
394 /* 0xc1 = 11000001b */ 0,
395 /* 0xc2 = 11000010b */ 0,
396 /* 0xc3 = 11000011b */ X86_EFL_PF,
397 /* 0xc4 = 11000100b */ 0,
398 /* 0xc5 = 11000101b */ X86_EFL_PF,
399 /* 0xc6 = 11000110b */ X86_EFL_PF,
400 /* 0xc7 = 11000111b */ 0,
401 /* 0xc8 = 11001000b */ 0,
402 /* 0xc9 = 11001001b */ X86_EFL_PF,
403 /* 0xca = 11001010b */ X86_EFL_PF,
404 /* 0xcb = 11001011b */ 0,
405 /* 0xcc = 11001100b */ X86_EFL_PF,
406 /* 0xcd = 11001101b */ 0,
407 /* 0xce = 11001110b */ 0,
408 /* 0xcf = 11001111b */ X86_EFL_PF,
409 /* 0xd0 = 11010000b */ 0,
410 /* 0xd1 = 11010001b */ X86_EFL_PF,
411 /* 0xd2 = 11010010b */ X86_EFL_PF,
412 /* 0xd3 = 11010011b */ 0,
413 /* 0xd4 = 11010100b */ X86_EFL_PF,
414 /* 0xd5 = 11010101b */ 0,
415 /* 0xd6 = 11010110b */ 0,
416 /* 0xd7 = 11010111b */ X86_EFL_PF,
417 /* 0xd8 = 11011000b */ X86_EFL_PF,
418 /* 0xd9 = 11011001b */ 0,
419 /* 0xda = 11011010b */ 0,
420 /* 0xdb = 11011011b */ X86_EFL_PF,
421 /* 0xdc = 11011100b */ 0,
422 /* 0xdd = 11011101b */ X86_EFL_PF,
423 /* 0xde = 11011110b */ X86_EFL_PF,
424 /* 0xdf = 11011111b */ 0,
425 /* 0xe0 = 11100000b */ 0,
426 /* 0xe1 = 11100001b */ X86_EFL_PF,
427 /* 0xe2 = 11100010b */ X86_EFL_PF,
428 /* 0xe3 = 11100011b */ 0,
429 /* 0xe4 = 11100100b */ X86_EFL_PF,
430 /* 0xe5 = 11100101b */ 0,
431 /* 0xe6 = 11100110b */ 0,
432 /* 0xe7 = 11100111b */ X86_EFL_PF,
433 /* 0xe8 = 11101000b */ X86_EFL_PF,
434 /* 0xe9 = 11101001b */ 0,
435 /* 0xea = 11101010b */ 0,
436 /* 0xeb = 11101011b */ X86_EFL_PF,
437 /* 0xec = 11101100b */ 0,
438 /* 0xed = 11101101b */ X86_EFL_PF,
439 /* 0xee = 11101110b */ X86_EFL_PF,
440 /* 0xef = 11101111b */ 0,
441 /* 0xf0 = 11110000b */ X86_EFL_PF,
442 /* 0xf1 = 11110001b */ 0,
443 /* 0xf2 = 11110010b */ 0,
444 /* 0xf3 = 11110011b */ X86_EFL_PF,
445 /* 0xf4 = 11110100b */ 0,
446 /* 0xf5 = 11110101b */ X86_EFL_PF,
447 /* 0xf6 = 11110110b */ X86_EFL_PF,
448 /* 0xf7 = 11110111b */ 0,
449 /* 0xf8 = 11111000b */ 0,
450 /* 0xf9 = 11111001b */ X86_EFL_PF,
451 /* 0xfa = 11111010b */ X86_EFL_PF,
452 /* 0xfb = 11111011b */ 0,
453 /* 0xfc = 11111100b */ X86_EFL_PF,
454 /* 0xfd = 11111101b */ 0,
455 /* 0xfe = 11111110b */ 0,
456 /* 0xff = 11111111b */ X86_EFL_PF,
457};
458
459/* for clang: */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT32U g_ar32One[];
464extern const RTFLOAT80U g_ar80One[];
465extern const RTFLOAT80U g_r80Indefinite;
466extern const RTFLOAT32U g_ar32Infinity[];
467extern const RTFLOAT64U g_ar64Infinity[];
468extern const RTFLOAT80U g_ar80Infinity[];
469extern const RTFLOAT128U g_r128Ln2;
470extern const RTUINT128U g_u128Ln2Mantissa;
471extern const RTUINT128U g_u128Ln2MantissaIntel;
472extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
473extern const RTFLOAT32U g_ar32QNaN[];
474extern const RTFLOAT64U g_ar64QNaN[];
475
476/** Zero values (indexed by fSign). */
477RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
478RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
479RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
480
481/** One values (indexed by fSign). */
482RTFLOAT32U const g_ar32One[] =
483{ RTFLOAT32U_INIT(0, 0, RTFLOAT32U_EXP_BIAS), RTFLOAT32U_INIT(1, 0, RTFLOAT32U_EXP_BIAS) };
484RTFLOAT80U const g_ar80One[] =
485{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
486
487/** Indefinite (negative). */
488RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
489
490/** Infinities (indexed by fSign). */
491RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
492RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
493RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
494
495/** Default QNaNs (indexed by fSign). */
496RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
497RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
498
499
500#if 0
501/** 128-bit floating point constant: 2.0 */
502const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
503#endif
504
505
506/* The next section is generated by tools/IEMGenFpuConstants: */
507
508/** The ln2 constant as 128-bit floating point value.
509 * base-10: 6.93147180559945309417232121458176575e-1
510 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
511 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
512//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
513const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
514/** High precision ln2 value.
515 * base-10: 6.931471805599453094172321214581765680747e-1
516 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
517 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
518const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
519/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
520 * base-10: 6.931471805599453094151379470289064954613e-1
521 * base-16: b.17217f7d1cf79abc0000000000000000@-1
522 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
523const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
524
525/** Horner constants for f2xm1 */
526const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
527{
528 /* a0
529 * base-10: 1.00000000000000000000000000000000000e0
530 * base-16: 1.0000000000000000000000000000@0
531 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
532 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
533 /* a1
534 * base-10: 5.00000000000000000000000000000000000e-1
535 * base-16: 8.0000000000000000000000000000@-1
536 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
537 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
538 /* a2
539 * base-10: 1.66666666666666666666666666666666658e-1
540 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
541 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
542 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
543 /* a3
544 * base-10: 4.16666666666666666666666666666666646e-2
545 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
546 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
547 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
548 /* a4
549 * base-10: 8.33333333333333333333333333333333323e-3
550 * base-16: 2.2222222222222222222222222222@-2
551 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
552 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
553 /* a5
554 * base-10: 1.38888888888888888888888888888888874e-3
555 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
556 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
557 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
558 /* a6
559 * base-10: 1.98412698412698412698412698412698412e-4
560 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
561 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
562 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
563 /* a7
564 * base-10: 2.48015873015873015873015873015873015e-5
565 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
566 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
567 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
568 /* a8
569 * base-10: 2.75573192239858906525573192239858902e-6
570 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
571 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
572 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
573 /* a9
574 * base-10: 2.75573192239858906525573192239858865e-7
575 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
576 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
577 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
578 /* a10
579 * base-10: 2.50521083854417187750521083854417184e-8
580 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
581 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
582 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
583 /* a11
584 * base-10: 2.08767569878680989792100903212014296e-9
585 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
586 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
587 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
588 /* a12
589 * base-10: 1.60590438368216145993923771701549472e-10
590 * base-16: b.092309d43684be51c198e91d7b40@-9
591 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
592 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
593 /* a13
594 * base-10: 1.14707455977297247138516979786821043e-11
595 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
596 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
597 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
598 /* a14
599 * base-10: 7.64716373181981647590113198578806964e-13
600 * base-16: d.73f9f399dc0f88ec32b587746578@-11
601 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
602 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
603 /* a15
604 * base-10: 4.77947733238738529743820749111754352e-14
605 * base-16: d.73f9f399dc0f88ec32b587746578@-12
606 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
607 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
608 /* a16
609 * base-10: 2.81145725434552076319894558301031970e-15
610 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
611 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
612 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
613 /* a17
614 * base-10: 1.56192069685862264622163643500573321e-16
615 * base-16: b.413c31dcbecbbdd8024435161550@-14
616 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
617 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
618 /* a18
619 * base-10: 8.22063524662432971695598123687227980e-18
620 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
621 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
622 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
623 /* a19
624 * base-10: 4.11031762331216485847799061843614006e-19
625 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
626 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
627 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
628 /* a20
629 * base-10: 1.95729410633912612308475743735054143e-20
630 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
631 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
632 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
633 /* a21
634 * base-10: 8.89679139245057328674889744250246106e-22
635 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
636 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
637 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
638};
639
640
641/*
642 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
643 * it all in C is probably safer atm., optimize what's necessary later, maybe.
644 */
645#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
646
647
648/*********************************************************************************************************************************
649* Binary Operations *
650*********************************************************************************************************************************/
651
652/*
653 * ADD
654 */
655
656IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
657{
658 uint64_t uDst = *puDst;
659 uint64_t uResult = uDst + uSrc;
660 *puDst = uResult;
661 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
662}
663
664# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
665
666IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
667{
668 uint32_t uDst = *puDst;
669 uint32_t uResult = uDst + uSrc;
670 *puDst = uResult;
671 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
672}
673
674
675IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
676{
677 uint16_t uDst = *puDst;
678 uint16_t uResult = uDst + uSrc;
679 *puDst = uResult;
680 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
681}
682
683
684IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
685{
686 uint8_t uDst = *puDst;
687 uint8_t uResult = uDst + uSrc;
688 *puDst = uResult;
689 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
690}
691
692# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
693
694/*
695 * ADC
696 */
697
698IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
699{
700 if (!(*pfEFlags & X86_EFL_CF))
701 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
702 else
703 {
704 uint64_t uDst = *puDst;
705 uint64_t uResult = uDst + uSrc + 1;
706 *puDst = uResult;
707 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
708 }
709}
710
711# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
712
713IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
714{
715 if (!(*pfEFlags & X86_EFL_CF))
716 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
717 else
718 {
719 uint32_t uDst = *puDst;
720 uint32_t uResult = uDst + uSrc + 1;
721 *puDst = uResult;
722 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
723 }
724}
725
726
727IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
728{
729 if (!(*pfEFlags & X86_EFL_CF))
730 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
731 else
732 {
733 uint16_t uDst = *puDst;
734 uint16_t uResult = uDst + uSrc + 1;
735 *puDst = uResult;
736 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
737 }
738}
739
740
741IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
742{
743 if (!(*pfEFlags & X86_EFL_CF))
744 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
745 else
746 {
747 uint8_t uDst = *puDst;
748 uint8_t uResult = uDst + uSrc + 1;
749 *puDst = uResult;
750 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
751 }
752}
753
754# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
755
756/*
757 * SUB
758 */
759
760IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
761{
762 uint64_t uDst = *puDst;
763 uint64_t uResult = uDst - uSrc;
764 *puDst = uResult;
765 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
766}
767
768# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
769
770IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
771{
772 uint32_t uDst = *puDst;
773 uint32_t uResult = uDst - uSrc;
774 *puDst = uResult;
775 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
776}
777
778
779IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
780{
781 uint16_t uDst = *puDst;
782 uint16_t uResult = uDst - uSrc;
783 *puDst = uResult;
784 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
785}
786
787
788IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
789{
790 uint8_t uDst = *puDst;
791 uint8_t uResult = uDst - uSrc;
792 *puDst = uResult;
793 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
794}
795
796# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
797
798/*
799 * SBB
800 */
801
802IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
803{
804 if (!(*pfEFlags & X86_EFL_CF))
805 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
806 else
807 {
808 uint64_t uDst = *puDst;
809 uint64_t uResult = uDst - uSrc - 1;
810 *puDst = uResult;
811 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
812 }
813}
814
815# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
816
817IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
818{
819 if (!(*pfEFlags & X86_EFL_CF))
820 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
821 else
822 {
823 uint32_t uDst = *puDst;
824 uint32_t uResult = uDst - uSrc - 1;
825 *puDst = uResult;
826 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
827 }
828}
829
830
831IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
832{
833 if (!(*pfEFlags & X86_EFL_CF))
834 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
835 else
836 {
837 uint16_t uDst = *puDst;
838 uint16_t uResult = uDst - uSrc - 1;
839 *puDst = uResult;
840 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
841 }
842}
843
844
845IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
846{
847 if (!(*pfEFlags & X86_EFL_CF))
848 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
849 else
850 {
851 uint8_t uDst = *puDst;
852 uint8_t uResult = uDst - uSrc - 1;
853 *puDst = uResult;
854 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
855 }
856}
857
858# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
859
860
861/*
862 * OR
863 */
864
865IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
866{
867 uint64_t uResult = *puDst | uSrc;
868 *puDst = uResult;
869 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
870}
871
872# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
873
874IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
875{
876 uint32_t uResult = *puDst | uSrc;
877 *puDst = uResult;
878 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
879}
880
881
882IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
883{
884 uint16_t uResult = *puDst | uSrc;
885 *puDst = uResult;
886 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
887}
888
889
890IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
891{
892 uint8_t uResult = *puDst | uSrc;
893 *puDst = uResult;
894 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
895}
896
897# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
898
899/*
900 * XOR
901 */
902
903IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
904{
905 uint64_t uResult = *puDst ^ uSrc;
906 *puDst = uResult;
907 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
908}
909
910# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
911
912IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
913{
914 uint32_t uResult = *puDst ^ uSrc;
915 *puDst = uResult;
916 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
917}
918
919
920IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
921{
922 uint16_t uResult = *puDst ^ uSrc;
923 *puDst = uResult;
924 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
925}
926
927
928IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
929{
930 uint8_t uResult = *puDst ^ uSrc;
931 *puDst = uResult;
932 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
933}
934
935# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
936
937/*
938 * AND
939 */
940
941IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
942{
943 uint64_t const uResult = *puDst & uSrc;
944 *puDst = uResult;
945 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
946}
947
948# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
949
950IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
951{
952 uint32_t const uResult = *puDst & uSrc;
953 *puDst = uResult;
954 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
955}
956
957
958IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
959{
960 uint16_t const uResult = *puDst & uSrc;
961 *puDst = uResult;
962 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
963}
964
965
966IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
967{
968 uint8_t const uResult = *puDst & uSrc;
969 *puDst = uResult;
970 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
971}
972
973# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
974#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
975
976/*
977 * ANDN (BMI1 instruction)
978 */
979
980IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
981{
982 uint64_t const uResult = ~uSrc1 & uSrc2;
983 *puDst = uResult;
984 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
985}
986
987
988IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
989{
990 uint32_t const uResult = ~uSrc1 & uSrc2;
991 *puDst = uResult;
992 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
993}
994
995
996#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
997IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
998{
999 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1000}
1001#endif
1002
1003
1004#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1005IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1006{
1007 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1008}
1009#endif
1010
1011#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1012
1013/*
1014 * CMP
1015 */
1016
1017IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1018{
1019 uint64_t uDstTmp = *puDst;
1020 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1021}
1022
1023# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1024
1025IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1026{
1027 uint32_t uDstTmp = *puDst;
1028 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1029}
1030
1031
1032IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1033{
1034 uint16_t uDstTmp = *puDst;
1035 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1036}
1037
1038
1039IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1040{
1041 uint8_t uDstTmp = *puDst;
1042 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1043}
1044
1045# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1046
1047/*
1048 * TEST
1049 */
1050
1051IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1052{
1053 uint64_t uResult = *puDst & uSrc;
1054 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1055}
1056
1057# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1058
1059IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1060{
1061 uint32_t uResult = *puDst & uSrc;
1062 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1063}
1064
1065
1066IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1067{
1068 uint16_t uResult = *puDst & uSrc;
1069 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1070}
1071
1072
1073IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1074{
1075 uint8_t uResult = *puDst & uSrc;
1076 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1077}
1078
1079# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1080
1081
1082/*
1083 * LOCK prefixed variants of the above
1084 */
1085
/**
 * Locked binary operand operation for any operand width.
 *
 * Runs the plain iemAImpl_<mnemonic>_u<width> worker on a private copy of the
 * destination and publishes the result with a compare-and-exchange, retrying
 * with the freshly observed value until the exchange succeeds.  EFLAGS are
 * only written back after the successful attempt.
 *
 * Expects puDst, uSrc and pfEFlags to be in scope at the expansion site.
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uExpected = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uNew; \
        uint32_t                   fEflNew; \
        for (;;) \
        { \
            /* Recompute value and flags from scratch on every attempt. */ \
            uNew    = uExpected; \
            fEflNew = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNew, uSrc, &fEflNew); \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uExpected, &uExpected)) \
                break; \
        } \
        *pfEFlags = fEflNew; \
    } while (0)
1100
1101
1102#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1103 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
1104 uint ## a_cBitsWidth ## _t uSrc, \
1105 uint32_t *pfEFlags)) \
1106 { \
1107 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1108 }
1109
1110EMIT_LOCKED_BIN_OP(add, 64)
1111EMIT_LOCKED_BIN_OP(adc, 64)
1112EMIT_LOCKED_BIN_OP(sub, 64)
1113EMIT_LOCKED_BIN_OP(sbb, 64)
1114EMIT_LOCKED_BIN_OP(or, 64)
1115EMIT_LOCKED_BIN_OP(xor, 64)
1116EMIT_LOCKED_BIN_OP(and, 64)
1117# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1118EMIT_LOCKED_BIN_OP(add, 32)
1119EMIT_LOCKED_BIN_OP(adc, 32)
1120EMIT_LOCKED_BIN_OP(sub, 32)
1121EMIT_LOCKED_BIN_OP(sbb, 32)
1122EMIT_LOCKED_BIN_OP(or, 32)
1123EMIT_LOCKED_BIN_OP(xor, 32)
1124EMIT_LOCKED_BIN_OP(and, 32)
1125
1126EMIT_LOCKED_BIN_OP(add, 16)
1127EMIT_LOCKED_BIN_OP(adc, 16)
1128EMIT_LOCKED_BIN_OP(sub, 16)
1129EMIT_LOCKED_BIN_OP(sbb, 16)
1130EMIT_LOCKED_BIN_OP(or, 16)
1131EMIT_LOCKED_BIN_OP(xor, 16)
1132EMIT_LOCKED_BIN_OP(and, 16)
1133
1134EMIT_LOCKED_BIN_OP(add, 8)
1135EMIT_LOCKED_BIN_OP(adc, 8)
1136EMIT_LOCKED_BIN_OP(sub, 8)
1137EMIT_LOCKED_BIN_OP(sbb, 8)
1138EMIT_LOCKED_BIN_OP(or, 8)
1139EMIT_LOCKED_BIN_OP(xor, 8)
1140EMIT_LOCKED_BIN_OP(and, 8)
1141# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1142
1143
1144/*
1145 * Bit operations (same signature as above).
1146 */
1147
1148/*
1149 * BT
1150 */
1151
1152IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1153{
1154 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1155 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1156 Assert(uSrc < 64);
1157 uint64_t uDst = *puDst;
1158 if (uDst & RT_BIT_64(uSrc))
1159 *pfEFlags |= X86_EFL_CF;
1160 else
1161 *pfEFlags &= ~X86_EFL_CF;
1162}
1163
1164# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1165
1166IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1167{
1168 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1169 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1170 Assert(uSrc < 32);
1171 uint32_t uDst = *puDst;
1172 if (uDst & RT_BIT_32(uSrc))
1173 *pfEFlags |= X86_EFL_CF;
1174 else
1175 *pfEFlags &= ~X86_EFL_CF;
1176}
1177
1178IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1179{
1180 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1181 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1182 Assert(uSrc < 16);
1183 uint16_t uDst = *puDst;
1184 if (uDst & RT_BIT_32(uSrc))
1185 *pfEFlags |= X86_EFL_CF;
1186 else
1187 *pfEFlags &= ~X86_EFL_CF;
1188}
1189
1190# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1191
1192/*
1193 * BTC
1194 */
1195
1196IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1197{
1198 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1199 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1200 Assert(uSrc < 64);
1201 uint64_t fMask = RT_BIT_64(uSrc);
1202 uint64_t uDst = *puDst;
1203 if (uDst & fMask)
1204 {
1205 uDst &= ~fMask;
1206 *puDst = uDst;
1207 *pfEFlags |= X86_EFL_CF;
1208 }
1209 else
1210 {
1211 uDst |= fMask;
1212 *puDst = uDst;
1213 *pfEFlags &= ~X86_EFL_CF;
1214 }
1215}
1216
1217# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1218
1219IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1220{
1221 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1222 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1223 Assert(uSrc < 32);
1224 uint32_t fMask = RT_BIT_32(uSrc);
1225 uint32_t uDst = *puDst;
1226 if (uDst & fMask)
1227 {
1228 uDst &= ~fMask;
1229 *puDst = uDst;
1230 *pfEFlags |= X86_EFL_CF;
1231 }
1232 else
1233 {
1234 uDst |= fMask;
1235 *puDst = uDst;
1236 *pfEFlags &= ~X86_EFL_CF;
1237 }
1238}
1239
1240
1241IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1242{
1243 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1244 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1245 Assert(uSrc < 16);
1246 uint16_t fMask = RT_BIT_32(uSrc);
1247 uint16_t uDst = *puDst;
1248 if (uDst & fMask)
1249 {
1250 uDst &= ~fMask;
1251 *puDst = uDst;
1252 *pfEFlags |= X86_EFL_CF;
1253 }
1254 else
1255 {
1256 uDst |= fMask;
1257 *puDst = uDst;
1258 *pfEFlags &= ~X86_EFL_CF;
1259 }
1260}
1261
1262# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1263
1264/*
1265 * BTR
1266 */
1267
1268IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1269{
1270 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1271 logical operation (AND/OR/whatever). */
1272 Assert(uSrc < 64);
1273 uint64_t fMask = RT_BIT_64(uSrc);
1274 uint64_t uDst = *puDst;
1275 if (uDst & fMask)
1276 {
1277 uDst &= ~fMask;
1278 *puDst = uDst;
1279 *pfEFlags |= X86_EFL_CF;
1280 }
1281 else
1282 *pfEFlags &= ~X86_EFL_CF;
1283}
1284
1285# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1286
1287IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1288{
1289 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1290 logical operation (AND/OR/whatever). */
1291 Assert(uSrc < 32);
1292 uint32_t fMask = RT_BIT_32(uSrc);
1293 uint32_t uDst = *puDst;
1294 if (uDst & fMask)
1295 {
1296 uDst &= ~fMask;
1297 *puDst = uDst;
1298 *pfEFlags |= X86_EFL_CF;
1299 }
1300 else
1301 *pfEFlags &= ~X86_EFL_CF;
1302}
1303
1304
1305IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1306{
1307 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1308 logical operation (AND/OR/whatever). */
1309 Assert(uSrc < 16);
1310 uint16_t fMask = RT_BIT_32(uSrc);
1311 uint16_t uDst = *puDst;
1312 if (uDst & fMask)
1313 {
1314 uDst &= ~fMask;
1315 *puDst = uDst;
1316 *pfEFlags |= X86_EFL_CF;
1317 }
1318 else
1319 *pfEFlags &= ~X86_EFL_CF;
1320}
1321
1322# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1323
1324/*
1325 * BTS
1326 */
1327
1328IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1329{
1330 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1331 logical operation (AND/OR/whatever). */
1332 Assert(uSrc < 64);
1333 uint64_t fMask = RT_BIT_64(uSrc);
1334 uint64_t uDst = *puDst;
1335 if (uDst & fMask)
1336 *pfEFlags |= X86_EFL_CF;
1337 else
1338 {
1339 uDst |= fMask;
1340 *puDst = uDst;
1341 *pfEFlags &= ~X86_EFL_CF;
1342 }
1343}
1344
1345# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1346
1347IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1348{
1349 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1350 logical operation (AND/OR/whatever). */
1351 Assert(uSrc < 32);
1352 uint32_t fMask = RT_BIT_32(uSrc);
1353 uint32_t uDst = *puDst;
1354 if (uDst & fMask)
1355 *pfEFlags |= X86_EFL_CF;
1356 else
1357 {
1358 uDst |= fMask;
1359 *puDst = uDst;
1360 *pfEFlags &= ~X86_EFL_CF;
1361 }
1362}
1363
1364
1365IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1366{
1367 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1368 logical operation (AND/OR/whatever). */
1369 Assert(uSrc < 16);
1370 uint16_t fMask = RT_BIT_32(uSrc);
1371 uint32_t uDst = *puDst;
1372 if (uDst & fMask)
1373 *pfEFlags |= X86_EFL_CF;
1374 else
1375 {
1376 uDst |= fMask;
1377 *puDst = uDst;
1378 *pfEFlags &= ~X86_EFL_CF;
1379 }
1380}
1381
1382# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1383
1384
1385EMIT_LOCKED_BIN_OP(btc, 64)
1386EMIT_LOCKED_BIN_OP(btr, 64)
1387EMIT_LOCKED_BIN_OP(bts, 64)
1388# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1389EMIT_LOCKED_BIN_OP(btc, 32)
1390EMIT_LOCKED_BIN_OP(btr, 32)
1391EMIT_LOCKED_BIN_OP(bts, 32)
1392
1393EMIT_LOCKED_BIN_OP(btc, 16)
1394EMIT_LOCKED_BIN_OP(btr, 16)
1395EMIT_LOCKED_BIN_OP(bts, 16)
1396# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1397
1398
1399/*
1400 * Helpers for BSR and BSF.
1401 *
1402 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1403 * Intel behavior modelled on 10980xe, AMD on 3990X. Other marchs may
1404 * produce different result (see https://www.sandpile.org/x86/flags.htm),
1405 * but we restrict ourselves to emulating these recent marchs.
1406 */
/**
 * Stores a BSF/BSR result the Intel way.
 *
 * OF, SF, ZF, AF, PF and CF are cleared first.  When a bit was found the
 * zero-based index is stored and PF is taken from g_afParity for that index;
 * otherwise ZF and PF are set and the destination is left untouched.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_pfEFlags  Pointer to the EFLAGS variable (in/out).
 * @param   a_iBit      One-based index of the bit found, zero if none.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR result the AMD way.
 *
 * Only ZF is changed: cleared when a bit was found (the zero-based index is
 * stored), set otherwise (the destination is left untouched).
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_pfEFlags  Pointer to the EFLAGS variable (in/out).
 * @param   a_iBit      One-based index of the bit found, zero if none.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1429
1430
1431/*
1432 * BSF - first (least significant) bit set
1433 */
1434IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1435{
1436 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1437}
1438
1439IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1440{
1441 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1442}
1443
1444IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1445{
1446 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1447}
1448
1449# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1450
1451IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1452{
1453 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1454}
1455
1456IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1457{
1458 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1459}
1460
1461IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1462{
1463 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1464}
1465
1466
1467IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1468{
1469 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1470}
1471
1472IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1473{
1474 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1475}
1476
1477IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1478{
1479 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1480}
1481
1482# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1483
1484
1485/*
1486 * BSR - last (most significant) bit set
1487 */
1488IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1489{
1490 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1491}
1492
1493IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1494{
1495 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1496}
1497
1498IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1499{
1500 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1501}
1502
1503# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1504
1505IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1506{
1507 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1508}
1509
1510IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1511{
1512 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1513}
1514
1515IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1516{
1517 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1518}
1519
1520
1521IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1522{
1523 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1524}
1525
1526IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1527{
1528 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1529}
1530
1531IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1532{
1533 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1534}
1535
1536# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1537
1538
1539/*
1540 * Helpers for LZCNT and TZCNT.
1541 */
/**
 * Stores an LZCNT/TZCNT result the Intel way.
 *
 * The count is always stored.  OF, SF, ZF, AF, PF and CF are cleared first;
 * a non-zero count gets PF from g_afParity, a zero count sets ZF and PF, and
 * CF is set when the source operand is zero.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_uSrc      The source operand (only tested for zero here).
 * @param   a_pfEFlags  Pointer to the EFLAGS variable (in/out).
 * @param   a_uResult   The bit count to store.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT result the AMD way.
 *
 * The count is always stored.  Only ZF and CF are changed: ZF is set for a
 * zero count and CF is set when the source operand is zero.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_uSrc      The source operand (only tested for zero here).
 * @param   a_pfEFlags  Pointer to the EFLAGS variable (in/out).
 * @param   a_uResult   The bit count to store.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1564
1565
1566/*
1567 * LZCNT - count leading zero bits.
1568 */
1569IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1570{
1571 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1572}
1573
1574IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1575{
1576 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1577}
1578
1579IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1580{
1581 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1582}
1583
1584# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1585
1586IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1587{
1588 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1589}
1590
1591IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1592{
1593 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1594}
1595
1596IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1597{
1598 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1599}
1600
1601
1602IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1603{
1604 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1605}
1606
1607IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1608{
1609 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1610}
1611
1612IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1613{
1614 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1615}
1616
1617# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1618
1619
1620/*
 * TZCNT - count trailing zero bits.
1622 */
1623IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1624{
1625 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1626}
1627
1628IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1629{
1630 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1631}
1632
1633IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1634{
1635 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1636}
1637
1638# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1639
1640IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1641{
1642 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1643}
1644
1645IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1646{
1647 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1648}
1649
1650IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1651{
1652 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1653}
1654
1655
1656IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1657{
1658 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1659}
1660
1661IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1662{
1663 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1664}
1665
1666IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1667{
1668 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1669}
1670
1671# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1672#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1673
1674/*
1675 * BEXTR (BMI1 instruction)
1676 */
1677#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1678IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1679 a_Type uSrc2, uint32_t *pfEFlags)) \
1680{ \
1681 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1682 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1683 a_Type uResult; \
1684 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1685 if (iFirstBit < a_cBits) \
1686 { \
1687 uResult = uSrc1 >> iFirstBit; \
1688 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1689 if (cBits < a_cBits) \
1690 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1691 *puDst = uResult; \
1692 if (!uResult) \
1693 fEfl |= X86_EFL_ZF; \
1694 } \
1695 else \
1696 { \
1697 *puDst = uResult = 0; \
1698 fEfl |= X86_EFL_ZF; \
1699 } \
1700 /** @todo complete flag calculations. */ \
1701 *pfEFlags = fEfl; \
1702}
1703
1704EMIT_BEXTR(64, uint64_t, _fallback)
1705EMIT_BEXTR(32, uint32_t, _fallback)
1706#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1708#endif
1709#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1710EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1711#endif
1712
1713/*
1714 * BLSR (BMI1 instruction)
1715 */
1716#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1717IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1718{ \
1719 uint32_t fEfl1 = *pfEFlags; \
1720 uint32_t fEfl2 = fEfl1; \
1721 *puDst = uSrc; \
1722 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1723 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1724 \
1725 /* AMD: The carry flag is from the SUB operation. */ \
1726 /* 10890xe: PF always cleared? */ \
1727 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1728 fEfl2 |= fEfl1 & X86_EFL_CF; \
1729 *pfEFlags = fEfl2; \
1730}
1731
1732EMIT_BLSR(64, uint64_t, _fallback)
1733EMIT_BLSR(32, uint32_t, _fallback)
1734#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSR(64, uint64_t, RT_NOTHING)
1736#endif
1737#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1738EMIT_BLSR(32, uint32_t, RT_NOTHING)
1739#endif
1740
1741/*
1742 * BLSMSK (BMI1 instruction)
1743 */
1744#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1745IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1746{ \
1747 uint32_t fEfl1 = *pfEFlags; \
1748 uint32_t fEfl2 = fEfl1; \
1749 *puDst = uSrc; \
1750 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1751 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1752 \
1753 /* AMD: The carry flag is from the SUB operation. */ \
1754 /* 10890xe: PF always cleared? */ \
1755 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1756 fEfl2 |= fEfl1 & X86_EFL_CF; \
1757 *pfEFlags = fEfl2; \
1758}
1759
1760EMIT_BLSMSK(64, uint64_t, _fallback)
1761EMIT_BLSMSK(32, uint32_t, _fallback)
1762#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1764#endif
1765#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1766EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1767#endif
1768
1769/*
1770 * BLSI (BMI1 instruction)
1771 */
1772#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1773IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1774{ \
1775 uint32_t fEfl1 = *pfEFlags; \
1776 uint32_t fEfl2 = fEfl1; \
1777 *puDst = uSrc; \
1778 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1779 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1780 \
1781 /* AMD: The carry flag is from the SUB operation. */ \
1782 /* 10890xe: PF always cleared? */ \
1783 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1784 fEfl2 |= fEfl1 & X86_EFL_CF; \
1785 *pfEFlags = fEfl2; \
1786}
1787
1788EMIT_BLSI(64, uint64_t, _fallback)
1789EMIT_BLSI(32, uint32_t, _fallback)
1790#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1791EMIT_BLSI(64, uint64_t, RT_NOTHING)
1792#endif
1793#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1794EMIT_BLSI(32, uint32_t, RT_NOTHING)
1795#endif
1796
1797/*
1798 * BZHI (BMI2 instruction)
1799 */
1800#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1801IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1802 a_Type uSrc2, uint32_t *pfEFlags)) \
1803{ \
1804 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1805 a_Type uResult; \
1806 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1807 if (iFirstBit < a_cBits) \
1808 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1809 else \
1810 { \
1811 uResult = uSrc1; \
1812 fEfl |= X86_EFL_CF; \
1813 } \
1814 *puDst = uResult; \
1815 fEfl |= X86_EFL_CALC_ZF(uResult); \
1816 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1817 *pfEFlags = fEfl; \
1818}
1819
1820EMIT_BZHI(64, uint64_t, _fallback)
1821EMIT_BZHI(32, uint32_t, _fallback)
1822#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1823EMIT_BZHI(64, uint64_t, RT_NOTHING)
1824#endif
1825#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1826EMIT_BZHI(32, uint32_t, RT_NOTHING)
1827#endif
1828
1829/*
1830 * POPCNT
1831 */
1832RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1833{
1834 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1835 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1836 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1837 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1838};
1839
1840/** @todo Use native popcount where possible and employ some more efficient
1841 * algorithm here (or in asm.h fallback)! */
1842
1843DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1844{
1845 return g_abBitCounts6[ u16 & 0x3f]
1846 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1847 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1848}
1849
1850DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1851{
1852 return g_abBitCounts6[ u32 & 0x3f]
1853 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1854 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1855 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1856 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1857 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1858}
1859
1860DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1861{
1862 return g_abBitCounts6[ u64 & 0x3f]
1863 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1864 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1865 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1866 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1867 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1868 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1869 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1870 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1871 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1872 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1873}
1874
1875#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1876IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1877{ \
1878 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1879 a_Type uResult; \
1880 if (uSrc) \
1881 uResult = iemPopCountU ## a_cBits(uSrc); \
1882 else \
1883 { \
1884 fEfl |= X86_EFL_ZF; \
1885 uResult = 0; \
1886 } \
1887 *puDst = uResult; \
1888 *pfEFlags = fEfl; \
1889}
1890
1891EMIT_POPCNT(64, uint64_t, _fallback)
1892EMIT_POPCNT(32, uint32_t, _fallback)
1893EMIT_POPCNT(16, uint16_t, _fallback)
1894#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1895EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1896#endif
1897#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1898EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1899EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1900#endif
1901
1902
1903#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1904
1905/*
1906 * XCHG
1907 */
1908
1909IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1910{
1911#if ARCH_BITS >= 64
1912 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1913#else
1914 uint64_t uOldMem = *puMem;
1915 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1916 ASMNopPause();
1917 *puReg = uOldMem;
1918#endif
1919}
1920
1921# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1922
1923IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1924{
1925 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1926}
1927
1928
1929IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1930{
1931 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1932}
1933
1934
1935IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1936{
1937 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1938}
1939
1940# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1941
1942
1943/* Unlocked variants for fDisregardLock mode: */
1944
1945IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1946{
1947 uint64_t const uOld = *puMem;
1948 *puMem = *puReg;
1949 *puReg = uOld;
1950}
1951
1952# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1953
1954IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1955{
1956 uint32_t const uOld = *puMem;
1957 *puMem = *puReg;
1958 *puReg = uOld;
1959}
1960
1961
1962IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1963{
1964 uint16_t const uOld = *puMem;
1965 *puMem = *puReg;
1966 *puReg = uOld;
1967}
1968
1969
1970IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1971{
1972 uint8_t const uOld = *puMem;
1973 *puMem = *puReg;
1974 *puReg = uOld;
1975}
1976
1977# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1978
1979
1980/*
1981 * XADD and LOCK XADD.
1982 */
1983#define EMIT_XADD(a_cBitsWidth, a_Type) \
1984IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1985{ \
1986 a_Type uDst = *puDst; \
1987 a_Type uResult = uDst; \
1988 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1989 *puDst = uResult; \
1990 *puReg = uDst; \
1991} \
1992\
1993IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1994{ \
1995 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1996 a_Type uResult; \
1997 uint32_t fEflTmp; \
1998 do \
1999 { \
2000 uResult = uOld; \
2001 fEflTmp = *pfEFlags; \
2002 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2003 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2004 *puReg = uOld; \
2005 *pfEFlags = fEflTmp; \
2006}
2007EMIT_XADD(64, uint64_t)
2008# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2009EMIT_XADD(32, uint32_t)
2010EMIT_XADD(16, uint16_t)
2011EMIT_XADD(8, uint8_t)
2012# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2013
2014#endif
2015
2016/*
2017 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2018 *
2019 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2020 * instructions are emulated as locked.
2021 */
2022#if defined(IEM_WITHOUT_ASSEMBLY)
2023
2024IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2025{
2026 uint8_t uOld = *puAl;
2027 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2028 Assert(*puAl == uOld);
2029 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2030}
2031
2032
2033IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2034{
2035 uint16_t uOld = *puAx;
2036 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2037 Assert(*puAx == uOld);
2038 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2039}
2040
2041
2042IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2043{
2044 uint32_t uOld = *puEax;
2045 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2046 Assert(*puEax == uOld);
2047 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2048}
2049
2050
2051# if ARCH_BITS == 32
2052IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2053# else
2054IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2055# endif
2056{
2057# if ARCH_BITS == 32
2058 uint64_t const uSrcReg = *puSrcReg;
2059# endif
2060 uint64_t uOld = *puRax;
2061 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2062 Assert(*puRax == uOld);
2063 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2064}
2065
2066
2067IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2068 uint32_t *pEFlags))
2069{
2070 uint64_t const uNew = pu64EbxEcx->u;
2071 uint64_t const uOld = pu64EaxEdx->u;
2072 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2073 {
2074 Assert(pu64EaxEdx->u == uOld);
2075 *pEFlags |= X86_EFL_ZF;
2076 }
2077 else
2078 *pEFlags &= ~X86_EFL_ZF;
2079}
2080
2081
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * Atomic CMPXCHG16B (AMD64 and ARM64 hosts only).
 *
 * Stores @a pu128RbxRcx in @a *pu128Dst and sets ZF when the compare-exchange
 * succeeds; otherwise ZF is cleared.  @a pu128RaxRdx is handed to the
 * compare-exchange primitive as the location for the old value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    RTUINT128U const uExpected = *pu128RaxRdx; /* only needed by the assertion below */
#  endif
    bool fExchanged;
#  if defined(RT_ARCH_AMD64)
    fExchanged = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
                                        &pu128RaxRdx->u);
#  else
    fExchanged = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  endif
    if (!fExchanged)
        *pEFlags &= ~X86_EFL_ZF;
    else
    {
        Assert(pu128RaxRdx->s.Lo == uExpected.s.Lo && pu128RaxRdx->s.Hi == uExpected.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
}
# endif
2103
2104#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2105
2106# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2107IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2108 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2109{
2110 RTUINT128U u128Tmp = *pu128Dst;
2111 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2112 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2113 {
2114 *pu128Dst = *pu128RbxRcx;
2115 *pEFlags |= X86_EFL_ZF;
2116 }
2117 else
2118 {
2119 *pu128RaxRdx = u128Tmp;
2120 *pEFlags &= ~X86_EFL_ZF;
2121 }
2122}
2123#endif /* !RT_ARCH_ARM64 */
2124
2125#if defined(IEM_WITHOUT_ASSEMBLY)
2126
2127/* Unlocked versions mapped to the locked ones: */
2128
2129IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2130{
2131 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2132}
2133
2134
2135IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2136{
2137# if 0
2138 /* If correctly aligned, used the locked variation. */
2139 if (!((uintptr_t)pu16Dst & 1))
2140 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2141 else
2142# endif
2143 {
2144 /* Otherwise emulate it as best as we can. */
2145 uint16_t const uOld = *puAx;
2146 uint16_t const uDst = *pu16Dst;
2147 if (uOld == uDst)
2148 {
2149 *pu16Dst = uSrcReg;
2150 iemAImpl_cmp_u16(&uOld, uOld, pEFlags);
2151 }
2152 else
2153 {
2154 *puAx = uDst;
2155 iemAImpl_cmp_u16(&uOld, uDst, pEFlags);
2156 }
2157 }
2158}
2159
2160
2161IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2162{
2163# if 0
2164 /* If correctly aligned, used the locked variation. */
2165 if (!((uintptr_t)pu32Dst & 3))
2166 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2167 else
2168# endif
2169 {
2170 /* Otherwise emulate it as best as we can. */
2171 uint32_t const uOld = *puEax;
2172 uint32_t const uDst = *pu32Dst;
2173 if (uOld == uDst)
2174 {
2175 *pu32Dst = uSrcReg;
2176 iemAImpl_cmp_u32(&uOld, uOld, pEFlags);
2177 }
2178 else
2179 {
2180 *puEax = uDst;
2181 iemAImpl_cmp_u32(&uOld, uDst, pEFlags);
2182 }
2183 }
2184}
2185
2186
2187# if ARCH_BITS == 32
2188IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2189{
2190# if 0
2191 /* If correctly aligned, used the locked variation. */
2192 if (!((uintptr_t)pu32Dst & 7))
2193 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2194 else
2195# endif
2196 {
2197 /* Otherwise emulate it as best as we can. */
2198 uint64_t const uOld = *puRax;
2199 uint64_t const uSrc = *puSrcReg;
2200 uint64_t const uDst = *pu64Dst;
2201 if (uOld == uDst)
2202 {
2203 *pu64Dst = uSrc;
2204 iemAImpl_cmp_u64(&uOld, uOld, pEFlags);
2205 }
2206 else
2207 {
2208 *puRax = uDst;
2209 iemAImpl_cmp_u64(&uOld, uDst, pEFlags);
2210 }
2211 }
2212}
2213# else /* ARCH_BITS != 32 */
2214IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2215{
2216# if 0
2217 /* If correctly aligned, used the locked variation. */
2218 if (!((uintptr_t)pu64Dst & 7))
2219 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2220 else
2221# endif
2222 {
2223 /* Otherwise emulate it as best as we can. */
2224 uint64_t const uOld = *puRax;
2225 uint64_t const uDst = *pu64Dst;
2226 if (uOld == uDst)
2227 {
2228 *pu64Dst = uSrcReg;
2229 iemAImpl_cmp_u64(&uOld, uOld, pEFlags);
2230 }
2231 else
2232 {
2233 *puRax = uDst;
2234 iemAImpl_cmp_u64(&uOld, uDst, pEFlags);
2235 }
2236 }
2237}
2238# endif /* ARCH_BITS != 32 */
2239
2240
2241IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2242{
2243# if 0
2244 /* If correctly aligned, used the locked variation. */
2245 if (!((uintptr_t)pu64Dst & 7))
2246 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2247 else
2248# endif
2249 {
2250 /* Otherwise emulate it as best as we can. */
2251 uint64_t const uNew = pu64EbxEcx->u;
2252 uint64_t const uOld = pu64EaxEdx->u;
2253 uint64_t const uDst = *pu64Dst;
2254 if (uDst == uOld)
2255 {
2256 *pu64Dst = uNew;
2257 *pEFlags |= X86_EFL_ZF;
2258 }
2259 else
2260 {
2261 pu64EaxEdx->u = uDst;
2262 *pEFlags &= ~X86_EFL_ZF;
2263 }
2264 }
2265}
2266
2267
2268IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2269 uint32_t *pEFlags))
2270{
2271# if 0
2272 /* If correctly aligned, used the locked variation. */
2273 if (!((uintptr_t)pu64Dst & 15))
2274 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2275 else
2276# endif
2277 {
2278 /* Otherwise emulate it as best as we can. */
2279# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2280 uint128_t const uNew = pu128RbxRcx->u;
2281 uint128_t const uOld = pu128RaxRdx->u;
2282 uint128_t const uDst = pu128Dst->u;
2283 if (uDst == uOld)
2284 {
2285 pu128Dst->u = uNew;
2286 *pEFlags |= X86_EFL_ZF;
2287 }
2288 else
2289 {
2290 pu128RaxRdx->u = uDst;
2291 *pEFlags &= ~X86_EFL_ZF;
2292 }
2293# else
2294 RTUINT128U const uNew = *pu128RbxRcx;
2295 RTUINT128U const uOld = *pu128RaxRdx;
2296 RTUINT128U const uDst = *pu128Dst;
2297 if ( uDst.s.Lo == uOld.s.Lo
2298 && uDst.s.Hi == uOld.s.Hi)
2299 {
2300 *pu128Dst = uNew;
2301 *pEFlags |= X86_EFL_ZF;
2302 }
2303 else
2304 {
2305 *pu128RaxRdx = uDst;
2306 *pEFlags &= ~X86_EFL_ZF;
2307 }
2308# endif
2309 }
2310}
2311
2312#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2313
2314#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2315 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2316
/*
 * MUL, IMUL, DIV and IDIV helpers.
 *
 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
 *   multiplication and division steps so we can select between using C
 *   operators and RTUInt128DivRem/RTUInt128MulU64ByU64.
 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, and
 *   IDIV/DIV take all their input from AX too.  This means we have to abstract
 *   some input loads and the result storing.
 */
2328
2329DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2330{
2331# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2332 pQuotient->s.Lo = 0;
2333 pQuotient->s.Hi = 0;
2334# endif
2335 RTUINT128U Divisor;
2336 Divisor.s.Lo = u64Divisor;
2337 Divisor.s.Hi = 0;
2338 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2339}
2340
/*
 * Operand access and arithmetic abstractions for the EMIT_MUL, EMIT_IMUL,
 * EMIT_DIV and EMIT_IDIV templates.  They rely on the template functions'
 * parameter names (puA, puD and puAX).
 *
 * All of these expand to a single expression without a trailing semicolon, so
 * that "MACRO(...);" is exactly one statement.  Macro arguments are
 * parenthesized on use.
 */

/* Loads the double width dividend from xDX:xAX, or from AX for the 8-bit form. */
# define DIV_LOAD(a_Dividend) \
    (a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD
# define DIV_LOAD_U8(a_Dividend) \
    (a_Dividend).u = *puAX

/* Stores quotient + remainder in xAX + xDX, or in AL + AH for the 8-bit form. */
# define DIV_STORE(a_Quotient, a_uRemainder)    *puA  = (a_Quotient), *puD = (a_uRemainder)
# define DIV_STORE_U8(a_Quotient, a_uRemainder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8)

/* Loads the implicit first factor from xAX, or from AL for the 8-bit form. */
# define MUL_LOAD_F1()                          (*puA)
# define MUL_LOAD_F1_U8()                       ((uint8_t)*puAX)

/* Stores the double width product in xDX:xAX, or in AX for the 8-bit form. */
# define MUL_STORE(a_Result)                    *puA  = (a_Result).s.Lo, *puD = (a_Result).s.Hi
# define MUL_STORE_U8(a_Result)                 *puAX = (a_Result).u

/* Two's complement negation of a double width value. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Unsigned multiplication producing a double width result. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), (a_Factor1), (a_Factor2))

/* Unsigned division of a double width dividend, yielding quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    (a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
    (a_Remainder).u = (a_Dividend).u % (a_uDivisor)
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), (a_uDivisor))
2370
2371
2372/*
2373 * MUL
2374 */
2375# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2376IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2377{ \
2378 RTUINT ## a_cBitsWidth2x ## U Result; \
2379 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2380 a_fnStore(Result); \
2381 \
2382 /* Calc EFLAGS: */ \
2383 uint32_t fEfl = *pfEFlags; \
2384 if (a_fIntelFlags) \
2385 { /* Intel: 6700K and 10980XE behavior */ \
2386 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2387 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2388 fEfl |= X86_EFL_SF; \
2389 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2390 if (Result.s.Hi != 0) \
2391 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2392 } \
2393 else \
2394 { /* AMD: 3990X */ \
2395 if (Result.s.Hi != 0) \
2396 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2397 else \
2398 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2399 } \
2400 *pfEFlags = fEfl; \
2401 return 0; \
2402} \
2403
2404# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2405 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2406 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2407 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2408
2409# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2410EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2411 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2412# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2413EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2414 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2415EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2416 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2417EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2418 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2419# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2420# endif /* !DOXYGEN_RUNNING */
2421
2422/*
2423 * MULX
2424 */
2425# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2426IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2427 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2428{ \
2429 RTUINT ## a_cBitsWidth2x ## U Result; \
2430 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2431 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2432 *puDst1 = Result.s.Hi; \
2433} \
2434
2435# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2436EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2437EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2438# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2439EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2440EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2441# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2442# endif /* !DOXYGEN_RUNNING */
2443
2444
2445/*
2446 * IMUL
2447 *
2448 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2449 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2450 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2451 */
2452# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2453 a_Suffix, a_fIntelFlags) \
2454IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2455{ \
2456 RTUINT ## a_cBitsWidth2x ## U Result; \
2457 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2458 \
2459 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2460 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2461 { \
2462 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2463 { \
2464 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2465 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2466 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2467 } \
2468 else \
2469 { \
2470 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2471 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2472 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2473 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2474 a_fnNeg(Result, a_cBitsWidth2x); \
2475 } \
2476 } \
2477 else \
2478 { \
2479 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2480 { \
2481 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2482 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2483 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2484 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2485 a_fnNeg(Result, a_cBitsWidth2x); \
2486 } \
2487 else \
2488 { \
2489 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2490 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2491 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2492 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2493 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2494 } \
2495 } \
2496 a_fnStore(Result); \
2497 \
2498 if (a_fIntelFlags) \
2499 { \
2500 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2501 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2502 fEfl |= X86_EFL_SF; \
2503 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2504 } \
2505 *pfEFlags = fEfl; \
2506 return 0; \
2507}
2508# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2509 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2510 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2511 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2512
2513# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2514EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2515 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2516# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2517EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2518 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2519EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2520 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2521EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2522 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2523# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2524# endif /* !DOXYGEN_RUNNING */
2525
2526
2527/*
2528 * IMUL with two operands are mapped onto the three operand variant, ignoring
2529 * the high part of the product.
2530 */
2531# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2532IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2533{ \
2534 a_uType uIgn; \
2535 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2536} \
2537\
2538IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2539{ \
2540 a_uType uIgn; \
2541 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2542} \
2543\
2544IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2545{ \
2546 a_uType uIgn; \
2547 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2548}
2549
2550EMIT_IMUL_TWO(64, uint64_t)
2551# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2552EMIT_IMUL_TWO(32, uint32_t)
2553EMIT_IMUL_TWO(16, uint16_t)
2554# endif
2555
2556
2557/*
2558 * DIV
2559 */
2560# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2561 a_Suffix, a_fIntelFlags) \
2562IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2563{ \
2564 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2565 a_fnLoad(Dividend); \
2566 if ( uDivisor != 0 \
2567 && Dividend.s.Hi < uDivisor) \
2568 { \
2569 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2570 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2571 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2572 \
2573 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2574 if (!a_fIntelFlags) \
2575 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2576 return 0; \
2577 } \
2578 /* #DE */ \
2579 return -1; \
2580}
2581# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2582 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2583 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2584 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2585
2586# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2587EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2588 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2589# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2590EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2591 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2592EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2593 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2594EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2595 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2596# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2597# endif /* !DOXYGEN_RUNNING */
2598
2599
2600/*
2601 * IDIV
2602 *
2603 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2604 * set AF and clear PF, ZF and SF just like it does for DIV.
2605 *
2606 */
2607# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2608 a_Suffix, a_fIntelFlags) \
2609IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2610{ \
2611 /* Note! Skylake leaves all flags alone. */ \
2612 \
2613 /** @todo overflow checks */ \
2614 if (uDivisor != 0) \
2615 { \
2616 /* \
2617 * Convert to unsigned division. \
2618 */ \
2619 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2620 a_fnLoad(Dividend); \
2621 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2622 if (fSignedDividend) \
2623 a_fnNeg(Dividend, a_cBitsWidth2x); \
2624 \
2625 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2626 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2627 uDivisorPositive = uDivisor; \
2628 else \
2629 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2630 \
2631 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2632 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2633 \
2634 /* \
2635 * Setup the result, checking for overflows. \
2636 */ \
2637 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2638 { \
2639 if (!fSignedDividend) \
2640 { \
2641 /* Positive divisor, positive dividend => result positive. */ \
2642 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2643 { \
2644 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2645 if (!a_fIntelFlags) \
2646 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2647 return 0; \
2648 } \
2649 } \
2650 else \
2651 { \
2652 /* Positive divisor, negative dividend => result negative. */ \
2653 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2654 { \
2655 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2656 if (!a_fIntelFlags) \
2657 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2658 return 0; \
2659 } \
2660 } \
2661 } \
2662 else \
2663 { \
2664 if (!fSignedDividend) \
2665 { \
2666 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2667 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2668 { \
2669 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2670 if (!a_fIntelFlags) \
2671 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2672 return 0; \
2673 } \
2674 } \
2675 else \
2676 { \
2677 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2678 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2679 { \
2680 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2681 if (!a_fIntelFlags) \
2682 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2683 return 0; \
2684 } \
2685 } \
2686 } \
2687 } \
2688 /* #DE */ \
2689 return -1; \
2690}
/** Emits the three flavours of one IDIV worker by expanding EMIT_IDIV_INNER
 * with no name suffix, with _intel and with _amd.  The trailing argument is 1
 * for the first two expansions and 0 for _amd; presumably it is the
 * a_fIntelFlags selector - TODO confirm against the EMIT_IDIV_INNER parameter
 * list. */
2691# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2692 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2693 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2694 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2695
2696# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
/* 64-bit operand width with a 128-bit intermediate; uses the _U128 negate and divide helpers. */
2697EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2698 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
/* The narrower widths are only emitted here when not building for x86 or when assembly is disabled. */
2699# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2700EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2701 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2702EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2703 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
/* The 8-bit form takes a single uint16_t *puAX instead of the puA/puD pair, hence the _U8 load/store helpers. */
2704EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2705 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2706# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2707# endif /* !DOXYGEN_RUNNING */
2708
2709#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2710
2711
2712/*********************************************************************************************************************************
2713* Unary operations. *
2714*********************************************************************************************************************************/
2715#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2716
2717/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
2718 * Updates the status bits PF, AF, ZF, SF and OF for an INC or DEC instruction.
2719 *
2720 * CF is NOT modified for hysterical raisins (allegedly for carrying and
2721 * borrowing in arithmetic loops on intel 8008).
2722 *
2723 * Expands to a do/while(0) statement and yields no value; the new flags are written back through a_pfEFlags.
 * The arguments are expanded more than once, so pass plain variables or constants without side effects.
 *
2724 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2725 * @param a_uResult Unsigned result value.
2726 * @param a_uDst The original destination value (used for the AF and OF calculations).
2727 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2728 * @param a_OfMethod 0 for INC-style, 1 for DEC-style.
2729 */
2730#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
2731 do { \
2732 uint32_t fEflTmp = *(a_pfEFlags); \
2733 fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; /* mask is (~STATUS) | CF: clears every status bit except CF */ \
2734 fEflTmp |= g_afParity[(a_uResult) & 0xff]; /* parity comes from the low byte of the result only */ \
2735 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2736 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2737 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2738 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) /* INC: top bit clear before, set after */ \
2739 : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); /* DEC: top bit set before, clear after */ \
2740 *(a_pfEFlags) = fEflTmp; \
2741 } while (0)
2742
2743/*
2744 * INC
2745 */
2746
/** INC, 64-bit: stores *puDst + 1 (wrapping) back into *puDst and refreshes
 * *pfEFlags through the INC/DEC flag macro using the INC-style OF rule. */
2747IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2748{
2749 uint64_t const uBefore = *puDst;
2750 uint64_t const uAfter = uBefore + UINT64_C(1);
2751 *puDst = uAfter;
2752 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uAfter, uBefore, 64, 0 /* INC-style OF */);
2753}
2754
2755# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2756
/** INC, 32-bit: stores *puDst + 1 (wrapping) back into *puDst and refreshes
 * *pfEFlags through the INC/DEC flag macro using the INC-style OF rule. */
2757IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2758{
2759 uint32_t const uBefore = *puDst;
2760 uint32_t const uAfter = uBefore + UINT32_C(1);
2761 *puDst = uAfter;
2762 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uAfter, uBefore, 32, 0 /* INC-style OF */);
2763}
2764
2765
/** INC, 16-bit: stores *puDst + 1 (truncated to 16 bits) back into *puDst and
 * refreshes *pfEFlags through the INC/DEC flag macro using the INC-style OF rule. */
2766IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2767{
2768 uint16_t const uBefore = *puDst;
2769 uint16_t const uAfter = (uint16_t)(uBefore + 1);
2770 *puDst = uAfter;
2771 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uAfter, uBefore, 16, 0 /* INC-style OF */);
2772}
2773
/** INC, 8-bit: stores *puDst + 1 (truncated to 8 bits) back into *puDst and
 * refreshes *pfEFlags through the INC/DEC flag macro using the INC-style OF rule. */
2774IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2775{
2776 uint8_t const uBefore = *puDst;
2777 uint8_t const uAfter = (uint8_t)(uBefore + 1);
2778 *puDst = uAfter;
2779 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uAfter, uBefore, 8, 0 /* INC-style OF */);
2780}
2781
2782# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2783
2784
2785/*
2786 * DEC
2787 */
2788
/** DEC, 64-bit: stores *puDst - 1 (wrapping) back into *puDst and refreshes
 * *pfEFlags through the INC/DEC flag macro using the DEC-style OF rule. */
2789IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2790{
2791 uint64_t const uBefore = *puDst;
2792 uint64_t const uAfter = uBefore - UINT64_C(1);
2793 *puDst = uAfter;
2794 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uAfter, uBefore, 64, 1 /* DEC-style OF */);
2795}
2796
2797# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2798
/** DEC, 32-bit: stores *puDst - 1 (wrapping) back into *puDst and refreshes
 * *pfEFlags through the INC/DEC flag macro using the DEC-style OF rule. */
2799IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2800{
2801 uint32_t const uBefore = *puDst;
2802 uint32_t const uAfter = uBefore - UINT32_C(1);
2803 *puDst = uAfter;
2804 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uAfter, uBefore, 32, 1 /* DEC-style OF */);
2805}
2806
2807
/** DEC, 16-bit: stores *puDst - 1 (truncated to 16 bits) back into *puDst and
 * refreshes *pfEFlags through the INC/DEC flag macro using the DEC-style OF rule. */
2808IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2809{
2810 uint16_t const uBefore = *puDst;
2811 uint16_t const uAfter = (uint16_t)(uBefore - 1);
2812 *puDst = uAfter;
2813 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uAfter, uBefore, 16, 1 /* DEC-style OF */);
2814}
2815
2816
/** DEC, 8-bit: stores *puDst - 1 (truncated to 8 bits) back into *puDst and
 * refreshes *pfEFlags through the INC/DEC flag macro using the DEC-style OF rule. */
2817IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2818{
2819 uint8_t const uBefore = *puDst;
2820 uint8_t const uAfter = (uint8_t)(uBefore - 1);
2821 *puDst = uAfter;
2822 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uAfter, uBefore, 8, 1 /* DEC-style OF */);
2823}
2824
2825# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2826
2827
2828/*
2829 * NOT
2830 */
2831
/** NOT, 64-bit: replaces *puDst with its bitwise complement; *pfEFlags is never read or written. */
2832IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2833{
2834 /* NOT leaves EFLAGS alone, so the flags pointer is intentionally unused. */
2835 RT_NOREF_PV(pfEFlags);
2836 uint64_t const fOriginal = *puDst;
2837 uint64_t const fInverted = ~fOriginal;
2838 *puDst = fInverted;
2839}
2840
2841# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2842
/** NOT, 32-bit: replaces *puDst with its bitwise complement; *pfEFlags is never read or written. */
2843IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2844{
2845 /* NOT leaves EFLAGS alone, so the flags pointer is intentionally unused. */
2846 RT_NOREF_PV(pfEFlags);
2847 uint32_t const fOriginal = *puDst;
2848 uint32_t const fInverted = ~fOriginal;
2849 *puDst = fInverted;
2850}
2851
/** NOT, 16-bit: replaces *puDst with its bitwise complement; *pfEFlags is never read or written. */
2852IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2853{
2854 /* NOT leaves EFLAGS alone, so the flags pointer is intentionally unused. */
2855 RT_NOREF_PV(pfEFlags);
2856 uint16_t const fOriginal = *puDst;
2857 uint16_t const fInverted = (uint16_t)~fOriginal;
2858 *puDst = fInverted;
2859}
2860
/** NOT, 8-bit: replaces *puDst with its bitwise complement; *pfEFlags is never read or written. */
2861IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2862{
2863 /* NOT leaves EFLAGS alone, so the flags pointer is intentionally unused. */
2864 RT_NOREF_PV(pfEFlags);
2865 uint8_t const fOriginal = *puDst;
2866 uint8_t const fInverted = (uint8_t)~fOriginal;
2867 *puDst = fInverted;
2868}
2869
2870# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2871
2872
2873/*
2874 * NEG
2875 */
2876
2877/**
2878 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for an NEG instruction.
2879 *
2880 * Expands to a do/while(0) statement and yields no value; the new flags are written back through a_pfEFlags.
 * The arguments are expanded more than once, so pass plain variables or constants without side effects.
 *
2881 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2882 * @param a_uResult Unsigned result value.
2883 * @param a_uDst The original destination value (used for the CF, AF and OF calculations).
2884 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2885 */
2886#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
2887 do { \
2888 uint32_t fEflTmp = *(a_pfEFlags); \
2889 fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; \
2890 fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; /* CF is set for any non-zero operand */ \
2891 fEflTmp |= g_afParity[(a_uResult) & 0xff]; /* parity comes from the low byte of the result only */ \
2892 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2893 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2894 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2895 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); /* OF input: bits set in both operand and result */ \
2896 *(a_pfEFlags) = fEflTmp; \
2897 } while (0)
2898
/** NEG, 64-bit: stores 0 - *puDst (wrapping) back into *puDst and refreshes
 * *pfEFlags through IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG. */
2899IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2900{
2901 uint64_t const uOperand = *puDst;
2902 uint64_t const uNegated = UINT64_C(0) - uOperand;
2903 *puDst = uNegated;
2904 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uNegated, uOperand, 64);
2905}
2906
2907# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2908
/** NEG, 32-bit: stores 0 - *puDst (wrapping) back into *puDst and refreshes
 * *pfEFlags through IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG. */
2909IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2910{
2911 uint32_t const uOperand = *puDst;
2912 uint32_t const uNegated = UINT32_C(0) - uOperand;
2913 *puDst = uNegated;
2914 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uNegated, uOperand, 32);
2915}
2916
2917
/** NEG, 16-bit: stores 0 - *puDst (truncated to 16 bits) back into *puDst and
 * refreshes *pfEFlags through IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG. */
2918IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2919{
2920 uint16_t const uOperand = *puDst;
2921 uint16_t const uNegated = (uint16_t)(0 - uOperand);
2922 *puDst = uNegated;
2923 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uNegated, uOperand, 16);
2924}
2925
2926
/** NEG, 8-bit: stores 0 - *puDst (truncated to 8 bits) back into *puDst and
 * refreshes *pfEFlags through IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG. */
2927IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2928{
2929 uint8_t const uOperand = *puDst;
2930 uint8_t const uNegated = (uint8_t)(0 - uOperand);
2931 *puDst = uNegated;
2932 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uNegated, uOperand, 8);
2933}
2934
2935# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2936
2937/*
2938 * Locked variants.
2939 */
2940
2941/** Emit a function for doing a locked unary operand operation.
 *
 * The generated iemAImpl_<mnemonic>_u<width>_locked function reads the current
 * *puDst, applies the plain (non-locked) helper of the same mnemonic and width
 * to local copies of the value and of EFLAGS, and retries until the
 * compare-exchange installs the new value.  *pfEFlags is only written after
 * the exchange has succeeded.
 *
 * @param a_Mnemonic   Mnemonic part of the helper name (inc, dec, not, neg).
 * @param a_cBitsWidth Operand width used in the type and function names.
 */
2942# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2943 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2944 uint32_t *pfEFlags)) \
2945 { \
2946 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2947 uint ## a_cBitsWidth ## _t uTmp; \
2948 uint32_t fEflTmp; \
2949 do \
2950 { \
2951 uTmp = uOld; /* restart from the most recently observed value */ \
2952 fEflTmp = *pfEFlags; /* and from the caller's unmodified flags */ \
2953 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2954 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); /* presumably refreshes uOld on failure - TODO confirm */ \
2955 *pfEFlags = fEflTmp; \
2956 }
2957
/* The 64-bit locked helpers are emitted whenever the enclosing section is compiled. */
2958EMIT_LOCKED_UNARY_OP(inc, 64)
2959EMIT_LOCKED_UNARY_OP(dec, 64)
2960EMIT_LOCKED_UNARY_OP(not, 64)
2961EMIT_LOCKED_UNARY_OP(neg, 64)
2962# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2963EMIT_LOCKED_UNARY_OP(inc, 32)
2964EMIT_LOCKED_UNARY_OP(dec, 32)
2965EMIT_LOCKED_UNARY_OP(not, 32)
2966EMIT_LOCKED_UNARY_OP(neg, 32)
2967
2968EMIT_LOCKED_UNARY_OP(inc, 16)
2969EMIT_LOCKED_UNARY_OP(dec, 16)
2970EMIT_LOCKED_UNARY_OP(not, 16)
2971EMIT_LOCKED_UNARY_OP(neg, 16)
2972
2973EMIT_LOCKED_UNARY_OP(inc, 8)
2974EMIT_LOCKED_UNARY_OP(dec, 8)
2975EMIT_LOCKED_UNARY_OP(not, 8)
2976EMIT_LOCKED_UNARY_OP(neg, 8)
2977# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2978
2979#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2980
2981
2982/*********************************************************************************************************************************
2983* Shifting and Rotating *
2984*********************************************************************************************************************************/
2985
2986/*
2987 * ROL - rotate left.
 *
 * EMIT_ROL generates iemAImpl_rol_u<width><suffix>(puDst, cShift, pfEFlags).
 * The count is masked to width-1 for 32/64-bit operands and to 31 for narrower
 * ones; if the masked count is zero neither *puDst nor *pfEFlags is touched.
 * For 8/16-bit operands the count is then reduced to width-1 bits before the
 * rotate helper is called, so a non-zero multiple of the width rotates by zero
 * but still recomputes the flags.  Only CF and OF are modified.
 *
 * a_fnHlp is the rotate primitive; a_fIntelFlags selects the OF rule.
2988 */
2989#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2990IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2991{ \
2992 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2993 if (cShift) \
2994 { \
2995 if (a_cBitsWidth < 32) \
2996 cShift &= a_cBitsWidth - 1; \
2997 a_uType const uDst = *puDst; \
2998 a_uType const uResult = a_fnHlp(uDst, cShift); \
2999 *puDst = uResult; \
3000 \
3001 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3002 it the same way as for 1 bit shifts. */ \
3003 AssertCompile(X86_EFL_CF_BIT == 0); \
3004 uint32_t fEfl = *pfEFlags; \
3005 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3006 uint32_t const fCarry = (uResult & X86_EFL_CF); /* CF = bit 0 of the rotated value */ \
3007 fEfl |= fCarry; \
3008 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3009 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
3010 else /* Intel 10980XE: According to the first sub-shift: */ \
3011 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3012 *pfEFlags = fEfl; \
3013 } \
3014}
3015
3016#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3017EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
3018#endif
3019EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
3020EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
3021
3022#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3023EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
3024#endif
3025EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
3026EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
3027
/** 16-bit rotate-left primitive handed to EMIT_ROL: combines the bits shifted
 * up by cShift with the bits wrapped around from the top, truncated to 16 bits. */
3028DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
3029{
3030 return (uint16_t)((uValue >> (16 - cShift)) | (uValue << cShift));
3031}
3032#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3033EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
3034#endif
3035EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
3036EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
3037
/** 8-bit rotate-left primitive handed to EMIT_ROL: combines the bits shifted
 * up by cShift with the bits wrapped around from the top, truncated to 8 bits. */
3038DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
3039{
3040 return (uint8_t)((uValue >> (8 - cShift)) | (uValue << cShift));
3041}
3042#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3043EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
3044#endif
3045EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
3046EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
3047
3048
3049/*
3050 * ROR - rotate right.
 *
 * EMIT_ROR generates iemAImpl_ror_u<width><suffix>(puDst, cShift, pfEFlags).
 * Count masking is the same as for ROL: width-1 for 32/64-bit operands, 31 for
 * narrower ones, with a second reduction to width-1 bits for 8/16-bit operands
 * after the zero test.  A zero masked count leaves value and flags untouched.
 * Only CF and OF are modified.
 *
 * a_fnHlp is the rotate primitive; a_fIntelFlags selects the OF rule.
3051 */
3052#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
3053IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3054{ \
3055 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3056 if (cShift) \
3057 { \
3058 if (a_cBitsWidth < 32) \
3059 cShift &= a_cBitsWidth - 1; \
3060 a_uType const uDst = *puDst; \
3061 a_uType const uResult = a_fnHlp(uDst, cShift); \
3062 *puDst = uResult; \
3063 \
3064 /* Calc EFLAGS: */ \
3065 AssertCompile(X86_EFL_CF_BIT == 0); \
3066 uint32_t fEfl = *pfEFlags; \
3067 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3068 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; /* CF = top bit of the rotated value */ \
3069 fEfl |= fCarry; \
3070 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3071 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
3072 else /* Intel 10980XE: According to the first sub-shift: */ \
3073 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
3074 *pfEFlags = fEfl; \
3075 } \
3076}
3077
3078#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3079EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
3080#endif
3081EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
3082EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
3083
3084#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3085EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
3086#endif
3087EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
3088EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
3089
/** 16-bit rotate-right primitive handed to EMIT_ROR: combines the bits shifted
 * down by cShift with the bits wrapped around from the bottom, truncated to 16 bits. */
3090DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
3091{
3092 return (uint16_t)((uValue << (16 - cShift)) | (uValue >> cShift));
3093}
3094#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3095EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
3096#endif
3097EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
3098EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
3099
/** 8-bit rotate-right primitive handed to EMIT_ROR: combines the bits shifted
 * down by cShift with the bits wrapped around from the bottom, truncated to 8 bits. */
3100DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
3101{
3102 return (uint8_t)((uValue << (8 - cShift)) | (uValue >> cShift));
3103}
3104#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3105EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
3106#endif
3107EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
3108EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
3109
3110
3111/*
3112 * RCL - rotate left through carry.
 *
 * EMIT_RCL generates iemAImpl_rcl_u<width><suffix>(puDst, cShift, pfEFlags).
 * The count is masked to width-1 for 32/64-bit operands and to 31 for narrower
 * ones.  For 8/16-bit operands it is additionally reduced modulo width+1:
 * before the zero test in the Intel flavour, after it in the AMD flavour.  The
 * AMD 8/16-bit variants can therefore reach the body with cShift reduced to
 * zero; the value is then stored back unchanged and CF keeps its input value,
 * while OF is still recomputed.  Only CF and OF are modified.
3113 */
3114#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3115IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3116{ \
3117 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3118 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3119 cShift %= a_cBitsWidth + 1; \
3120 if (cShift) \
3121 { \
3122 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3123 cShift %= a_cBitsWidth + 1; \
3124 a_uType const uDst = *puDst; \
3125 a_uType uResult = uDst << cShift; \
3126 if (cShift > 1) \
3127 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
3128 \
3129 AssertCompile(X86_EFL_CF_BIT == 0); \
3130 uint32_t fEfl = *pfEFlags; \
3131 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3132 uResult |= cShift ? (a_uType)fInCarry << (cShift - 1) : 0; /* cShift may be 0 here (AMD 8/16-bit); never shift by -1 */ \
3133 \
3134 *puDst = uResult; \
3135 \
3136 /* Calc EFLAGS. */ \
3137 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3138 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3139 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
3140 fEfl |= fOutCarry; \
3141 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3142 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3143 else /* Intel 10980XE: According to the first sub-shift: */ \
3144 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3145 *pfEFlags = fEfl; \
3146 } \
3147}
3148
3149#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3150EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3151#endif
3152EMIT_RCL(64, uint64_t, _intel, 1)
3153EMIT_RCL(64, uint64_t, _amd, 0)
3154
3155#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3156EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3157#endif
3158EMIT_RCL(32, uint32_t, _intel, 1)
3159EMIT_RCL(32, uint32_t, _amd, 0)
3160
3161#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3162EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3163#endif
3164EMIT_RCL(16, uint16_t, _intel, 1)
3165EMIT_RCL(16, uint16_t, _amd, 0)
3166
3167#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3168EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3169#endif
3170EMIT_RCL(8, uint8_t, _intel, 1)
3171EMIT_RCL(8, uint8_t, _amd, 0)
3172
3173
3174/*
3175 * RCR - rotate right through carry.
 *
 * EMIT_RCR generates iemAImpl_rcr_u<width><suffix>(puDst, cShift, pfEFlags).
 * Count handling mirrors RCL: mask to width-1 (32/64-bit) or 31 (8/16-bit),
 * then for 8/16-bit operands reduce modulo width+1, before the zero test in the
 * Intel flavour and after it in the AMD flavour.  When the AMD reduction yields
 * zero the value is stored back unchanged and CF keeps its input value.  Only
 * CF and OF are modified.
3176 */
3177#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3178IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3179{ \
3180 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3181 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3182 cShift %= a_cBitsWidth + 1; \
3183 if (cShift) \
3184 { \
3185 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3186 cShift %= a_cBitsWidth + 1; \
3187 a_uType const uDst = *puDst; \
3188 a_uType uResult = uDst >> cShift; \
3189 if (cShift > 1) \
3190 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3191 \
3192 AssertCompile(X86_EFL_CF_BIT == 0); \
3193 uint32_t fEfl = *pfEFlags; \
3194 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3195 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); /* incoming CF lands just above the shifted-down bits */ \
3196 *puDst = uResult; \
3197 \
3198 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3199 it the same way as for 1 bit shifts. */ \
3200 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3201 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3202 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3203 fEfl |= fOutCarry; \
3204 if (!a_fIntelFlags) /* AMD 3990X: XOR two most significant bits of the result: */ \
3205 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3206 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3207 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3208 *pfEFlags = fEfl; \
3209 } \
3210}
3211
3212#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3213EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3214#endif
3215EMIT_RCR(64, uint64_t, _intel, 1)
3216EMIT_RCR(64, uint64_t, _amd, 0)
3217
3218#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3219EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3220#endif
3221EMIT_RCR(32, uint32_t, _intel, 1)
3222EMIT_RCR(32, uint32_t, _amd, 0)
3223
3224#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3225EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3226#endif
3227EMIT_RCR(16, uint16_t, _intel, 1)
3228EMIT_RCR(16, uint16_t, _amd, 0)
3229
3230#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3231EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3232#endif
3233EMIT_RCR(8, uint8_t, _intel, 1)
3234EMIT_RCR(8, uint8_t, _amd, 0)
3235
3236
3237/*
3238 * SHL - shift left.
 *
 * EMIT_SHL generates iemAImpl_shl_u<width><suffix>(puDst, cShift, pfEFlags).
 * The count is masked to width-1 for 32/64-bit operands and to 31 for narrower
 * ones; a zero masked count leaves value and flags untouched.  Otherwise all
 * status bits are cleared and CF, OF, SF, ZF and PF are recomputed; AF is set
 * in the AMD flavour and left clear in the Intel flavour.
 *
 * NOTE(review): for 8/16-bit operands cShift can exceed a_cBitsWidth (the mask
 * is 31), which makes the count in the CF expression negative - confirm this
 * is intended.
3239 */
3240#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3241IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3242{ \
3243 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3244 if (cShift) \
3245 { \
3246 a_uType const uDst = *puDst; \
3247 a_uType uResult = uDst << cShift; \
3248 *puDst = uResult; \
3249 \
3250 /* Calc EFLAGS. */ \
3251 AssertCompile(X86_EFL_CF_BIT == 0); \
3252 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3253 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the top */ \
3254 fEfl |= fCarry; \
3255 if (!a_fIntelFlags) \
3256 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3257 else \
3258 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3259 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3260 fEfl |= X86_EFL_CALC_ZF(uResult); \
3261 fEfl |= g_afParity[uResult & 0xff]; \
3262 if (!a_fIntelFlags) \
3263 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
3264 *pfEFlags = fEfl; \
3265 } \
3266}
3267
3268#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3269EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3270#endif
3271EMIT_SHL(64, uint64_t, _intel, 1)
3272EMIT_SHL(64, uint64_t, _amd, 0)
3273
3274#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3275EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3276#endif
3277EMIT_SHL(32, uint32_t, _intel, 1)
3278EMIT_SHL(32, uint32_t, _amd, 0)
3279
3280#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3281EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3282#endif
3283EMIT_SHL(16, uint16_t, _intel, 1)
3284EMIT_SHL(16, uint16_t, _amd, 0)
3285
3286#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3287EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3288#endif
3289EMIT_SHL(8, uint8_t, _intel, 1)
3290EMIT_SHL(8, uint8_t, _amd, 0)
3291
3292
3293/*
3294 * SHR - logical shift right.
 *
 * EMIT_SHR generates iemAImpl_shr_u<width><suffix>(puDst, cShift, pfEFlags).
 * The count is masked to width-1 for 32/64-bit operands and to 31 for narrower
 * ones; a zero masked count leaves value and flags untouched.  Otherwise all
 * status bits are cleared and CF, SF, ZF and PF are recomputed.  OF takes the
 * original top bit: always in the Intel flavour, only for a count of one in
 * the AMD flavour.  AF is set in the AMD flavour and left clear otherwise.
3295 */
3296#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3297IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3298{ \
3299 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3300 if (cShift) \
3301 { \
3302 a_uType const uDst = *puDst; \
3303 a_uType uResult = uDst >> cShift; \
3304 *puDst = uResult; \
3305 \
3306 /* Calc EFLAGS. */ \
3307 AssertCompile(X86_EFL_CF_BIT == 0); \
3308 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3309 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; /* CF = last bit shifted out of the bottom */ \
3310 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3311 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3312 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3313 fEfl |= X86_EFL_CALC_ZF(uResult); \
3314 fEfl |= g_afParity[uResult & 0xff]; \
3315 if (!a_fIntelFlags) \
3316 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
3317 *pfEFlags = fEfl; \
3318 } \
3319}
3320
3321#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3322EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3323#endif
3324EMIT_SHR(64, uint64_t, _intel, 1)
3325EMIT_SHR(64, uint64_t, _amd, 0)
3326
3327#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3328EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3329#endif
3330EMIT_SHR(32, uint32_t, _intel, 1)
3331EMIT_SHR(32, uint32_t, _amd, 0)
3332
3333#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3334EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3335#endif
3336EMIT_SHR(16, uint16_t, _intel, 1)
3337EMIT_SHR(16, uint16_t, _amd, 0)
3338
3339#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3340EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3341#endif
3342EMIT_SHR(8, uint8_t, _intel, 1)
3343EMIT_SHR(8, uint8_t, _amd, 0)
3344
3345
3346/*
3347 * SAR - arithmetic shift right.
 *
 * EMIT_SAR generates iemAImpl_sar_u<width><suffix>(puDst, cShift, pfEFlags).
 * The operand is reinterpreted as the signed type a_iType before shifting.
 * The count is masked to width-1 for 32/64-bit operands and to 31 for narrower
 * ones; a zero masked count leaves value and flags untouched.  Otherwise all
 * status bits are cleared and CF, SF, ZF and PF are recomputed; OF stays
 * clear; AF is set in the AMD flavour and left clear otherwise.
3348 */
3349#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3350IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3351{ \
3352 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3353 if (cShift) \
3354 { \
3355 a_iType const iDst = (a_iType)*puDst; \
3356 a_uType uResult = iDst >> cShift; \
3357 *puDst = uResult; \
3358 \
3359 /* Calc EFLAGS. \
3360 Note! The OF flag is always zero because the sign of the result never differs from the sign of the input. */ \
3361 AssertCompile(X86_EFL_CF_BIT == 0); \
3362 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3363 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; /* CF = last bit shifted out of the bottom */ \
3364 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3365 fEfl |= X86_EFL_CALC_ZF(uResult); \
3366 fEfl |= g_afParity[uResult & 0xff]; \
3367 if (!a_fIntelFlags) \
3368 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
3369 *pfEFlags = fEfl; \
3370 } \
3371}
3372
/* Unsuffixed variants are only emitted in C when the host assembly does not
 * supply them.  The 32/16/8-bit guards match the ones used for ROL, ROR, RCL,
 * RCR, SHL, SHR and SHLD so that x86 hosts with assembly enabled do not get a
 * second definition of the same symbol. */
3373#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3374EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3375#endif
3376EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3377EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3378
3379#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3380EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3381#endif
3382EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3383EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3384
3385#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3386EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3387#endif
3388EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3389EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3390
3391#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3392EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3393#endif
3394EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3395EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3396
3397
3398/*
3399 * SHLD
3400 *
3401 * - CF is the last bit shifted out of puDst.
3402 * - AF is always cleared by Intel 10980XE.
3403 * - AF is always set by AMD 3990X.
3404 * - OF is set according to the first shift on Intel 10980XE, it seems.
3405 * - OF is set according to the last sub-shift on AMD 3990X.
3406 * - ZF, SF and PF are calculated according to the result by both vendors.
3407 *
3408 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3409 * pick either the source register or the destination register for input bits
3410 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3411 * intel has changed behaviour here several times. We implement what current
3412 * skylake based does for now, we can extend this later as needed.
3413 */
3414#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3415IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3416 uint32_t *pfEFlags)) \
3417{ \
3418 cShift &= a_cBitsWidth - 1; /* only instantiated for 64 and 32 bits below; 16-bit has its own macro */ \
3419 if (cShift) \
3420 { \
3421 a_uType const uDst = *puDst; \
3422 a_uType uResult = uDst << cShift; \
3423 uResult |= uSrc >> (a_cBitsWidth - cShift); /* fill the vacated low bits from the top of uSrc */ \
3424 *puDst = uResult; \
3425 \
3426 /* CALC EFLAGS: */ \
3427 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3428 if (a_fIntelFlags) \
3429 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3430 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3431 else \
3432 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3433 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3434 fEfl |= X86_EFL_AF; \
3435 } \
3436 AssertCompile(X86_EFL_CF_BIT == 0); \
3437 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3438 fEfl |= g_afParity[uResult & 0xff]; \
3439 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3440 fEfl |= X86_EFL_CALC_ZF(uResult); \
3441 *pfEFlags = fEfl; \
3442 } \
3443}
3444
3445#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3446EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3447#endif
3448EMIT_SHLD(64, uint64_t, _intel, 1)
3449EMIT_SHLD(64, uint64_t, _amd, 0)
3450
3451#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3452EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3453#endif
3454EMIT_SHLD(32, uint32_t, _intel, 1)
3455EMIT_SHLD(32, uint32_t, _amd, 0)
3456
/* 16-bit SHLD.  The count is masked with 31 rather than 15, so the shift is
 * done on a 48-bit pattern held in a uint64_t: uDst in bits 47..32, uSrc in
 * bits 31..16, and in bits 15..0 either uDst again (Intel flavour) or uSrc
 * again (AMD flavour).  The result is bits 47..32 after shifting left. */
3457#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3458IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3459{ \
3460 cShift &= 31; \
3461 if (cShift) \
3462 { \
3463 uint16_t const uDst = *puDst; \
3464 uint64_t const uTmp = a_fIntelFlags \
3465 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3466 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3467 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3468 *puDst = uResult; \
3469 \
3470 /* CALC EFLAGS: */ \
3471 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3472 AssertCompile(X86_EFL_CF_BIT == 0); \
3473 if (a_fIntelFlags) \
3474 { \
3475 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3476 /* Intel 6700K & 10980XE: OF is set according to the first shift. AF always cleared. */ \
3477 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3478 } \
3479 else \
3480 { \
3481 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3482 if (cShift < 16) \
3483 { \
3484 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3485 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3486 } \
3487 else \
3488 { \
3489 if (cShift == 16) /* counts above 16 leave CF clear in this flavour */ \
3490 fEfl |= uDst & X86_EFL_CF; \
3491 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3492 } \
3493 fEfl |= X86_EFL_AF; \
3494 } \
3495 fEfl |= g_afParity[uResult & 0xff]; \
3496 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3497 fEfl |= X86_EFL_CALC_ZF(uResult); \
3498 *pfEFlags = fEfl; \
3499 } \
3500}
3501
3502#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3503EMIT_SHLD_16(RT_NOTHING, 1)
3504#endif
3505EMIT_SHLD_16(_intel, 1)
3506EMIT_SHLD_16(_amd, 0)
3507
3508
3509/*
3510 * SHRD
3511 *
3512 * EFLAGS behaviour seems to be the same as with SHLD:
3513 * - CF is the last bit shifted out of puDst.
3514 * - AF is always cleared by Intel 10980XE.
3515 * - AF is always set by AMD 3990X.
3516 * - OF is set according to the first shift on Intel 10980XE, it seems.
3517 * - OF is set according to the last sub-shift on AMD 3990X.
3518 * - ZF, SF and PF are calculated according to the result by both vendors.
3519 *
3520 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3521 * pick either the source register or the destination register for input bits
3522 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
 * Intel has changed behaviour here several times. We implement what current
 * Skylake-based CPUs do for now; this can be extended later as needed.
3525 */
/**
 * Emits the SHRD worker iemAImpl_shrd_u<a_cBitsWidth><a_Suffix> for the 32-bit
 * and 64-bit operand sizes.
 *
 * The count is masked with a_cBitsWidth - 1.  A zero (masked) count leaves
 * both *puDst and *pfEFlags untouched.  Otherwise the result is uDst shifted
 * right by the count, with the vacated upper bits filled from the low bits
 * of uSrc.
 *
 * @param   a_cBitsWidth    Operand width in bits (32 or 64 as instantiated below).
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero selects the Intel EFLAGS flavour, zero the AMD one.
 */
#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth - 1; \
    if (cShift) \
    { \
        a_uType const uDst = *puDst; \
        a_uType uResult = uDst >> cShift; \
        /* cShift is at least 1 here, so the left shift count stays below the operand width. */ \
        uResult |= uSrc << (a_cBitsWidth - cShift); \
        *puDst = uResult; \
        \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        /* CF = last bit shifted out of uDst. */ \
        fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
        if (a_fIntelFlags) \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
        else \
        {   /* AMD 3990X: Set according to last shift. AF always set. */ \
            if (cShift > 1) /* Set according to last shift. */ \
                fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
            else \
                fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        /* SF, ZF and PF are derived from the result in both flavours. */ \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        *pfEFlags = fEfl; \
    } \
}

/* The unsuffixed variants use the Intel flavour and are only emitted here when
   no assembly implementation is in use for the host architecture. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHRD(64, uint64_t, _intel, 1)
EMIT_SHRD(64, uint64_t, _amd, 0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHRD(32, uint32_t, _intel, 1)
EMIT_SHRD(32, uint32_t, _amd, 0)
3569
/**
 * Emits the 16-bit SHRD worker iemAImpl_shrd_u16<a_Suffix>.
 *
 * The count is masked with 31 (not 15), so counts of 16 and above do occur.
 * To handle them a 48-bit operand is assembled in uTmp: uDst in bits 15:0,
 * uSrc in bits 31:16 and, in bits 47:32, uDst again (Intel flavour) or uSrc
 * again (AMD flavour).  The result is the low 16 bits of that operand after
 * it has been shifted right by the count.
 *
 * A zero (masked) count leaves both *puDst and *pfEFlags untouched.
 *
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero selects the Intel operand/EFLAGS flavour,
 *                          zero selects the AMD one.
 */
#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= 31; \
    if (cShift) \
    { \
        uint16_t const uDst = *puDst; \
        uint64_t const uTmp = a_fIntelFlags \
                            ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
                            : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
        uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
        *puDst = uResult; \
        \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        if (a_fIntelFlags) \
        { \
            /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
            fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
        } \
        else \
        { \
            /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
            fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
            /* AMD 3990X: Set according to last shift. AF always set. */ \
            if (cShift > 1) /* Set according to last shift. */ \
                fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
            else \
                fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        /* SF, ZF and PF are derived from the 16-bit result in both flavours. */ \
        fEfl |= X86_EFL_CALC_SF(uResult, 16); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        *pfEFlags = fEfl; \
    } \
}

/* The unsuffixed variant uses the Intel flavour and is only emitted here when
   the x86/AMD64 assembly implementation is not in use. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD_16(RT_NOTHING, 1)
#endif
EMIT_SHRD_16(_intel, 1)
EMIT_SHRD_16(_amd, 0)
3615
3616
3617/*
3618 * RORX (BMI2)
3619 */
3620#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3621IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3622{ \
3623 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3624}
3625
3626#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3627EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3628#endif
3629#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3630EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3631#endif
3632
3633
3634/*
3635 * SHLX (BMI2)
3636 */
3637#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3638IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3639{ \
3640 cShift &= a_cBitsWidth - 1; \
3641 *puDst = uSrc << cShift; \
3642}
3643
3644#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3645EMIT_SHLX(64, uint64_t, RT_NOTHING)
3646EMIT_SHLX(64, uint64_t, _fallback)
3647#endif
3648#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3649EMIT_SHLX(32, uint32_t, RT_NOTHING)
3650EMIT_SHLX(32, uint32_t, _fallback)
3651#endif
3652
3653
3654/*
3655 * SHRX (BMI2)
3656 */
3657#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3658IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3659{ \
3660 cShift &= a_cBitsWidth - 1; \
3661 *puDst = uSrc >> cShift; \
3662}
3663
3664#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3665EMIT_SHRX(64, uint64_t, RT_NOTHING)
3666EMIT_SHRX(64, uint64_t, _fallback)
3667#endif
3668#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3669EMIT_SHRX(32, uint32_t, RT_NOTHING)
3670EMIT_SHRX(32, uint32_t, _fallback)
3671#endif
3672
3673
3674/*
3675 * SARX (BMI2)
3676 */
3677#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3678IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3679{ \
3680 cShift &= a_cBitsWidth - 1; \
3681 *puDst = (a_iType)uSrc >> cShift; \
3682}
3683
3684#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3685EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3686EMIT_SARX(64, uint64_t, int64_t, _fallback)
3687#endif
3688#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3689EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3690EMIT_SARX(32, uint32_t, int32_t, _fallback)
3691#endif
3692
3693
3694/*
3695 * PDEP (BMI2)
3696 */
3697#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3698IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3699{ \
3700 a_uType uResult = 0; \
3701 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3702 if (fMask & ((a_uType)1 << iMaskBit)) \
3703 { \
3704 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3705 iBit++; \
3706 } \
3707 *puDst = uResult; \
3708}
3709
3710#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3711EMIT_PDEP(64, uint64_t, RT_NOTHING)
3712#endif
3713EMIT_PDEP(64, uint64_t, _fallback)
3714#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3715EMIT_PDEP(32, uint32_t, RT_NOTHING)
3716#endif
3717EMIT_PDEP(32, uint32_t, _fallback)
3718
3719/*
3720 * PEXT (BMI2)
3721 */
3722#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3723IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3724{ \
3725 a_uType uResult = 0; \
3726 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3727 if (fMask & ((a_uType)1 << iMaskBit)) \
3728 { \
3729 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3730 iBit++; \
3731 } \
3732 *puDst = uResult; \
3733}
3734
3735#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3736EMIT_PEXT(64, uint64_t, RT_NOTHING)
3737#endif
3738EMIT_PEXT(64, uint64_t, _fallback)
3739#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3740EMIT_PEXT(32, uint32_t, RT_NOTHING)
3741#endif
3742EMIT_PEXT(32, uint32_t, _fallback)
3743
3744
3745#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3746
3747# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3748/*
3749 * BSWAP
3750 */
3751
3752IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3753{
3754 *puDst = ASMByteSwapU64(*puDst);
3755}
3756
3757
3758IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3759{
3760 *puDst = ASMByteSwapU32(*puDst);
3761}
3762
3763
/**
 * BSWAP with a 16-bit operand.
 *
 * Zeroes the 16-bit value at the start of @a puDst; the remaining bytes of the
 * 32-bit destination are not written.  The disabled branch would byte swap
 * those 16 bits instead.
 *
 * @param   puDst   The destination.
 * @note    Undocumented, so 32-bit arg.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
{
#if 0
    *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
#else
    /* This is the behaviour AMD 3990x (64-bit mode): */
    *(uint16_t *)puDst = 0;
#endif
}
3774
3775# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3776
3777
3778
3779# if defined(IEM_WITHOUT_ASSEMBLY)
3780
3781/*
3782 * LFENCE, SFENCE & MFENCE.
3783 */
3784
/**
 * LFENCE worker for builds without the assembly implementation; simply calls
 * ASMReadFence() (presumably a load fence - see its IPRT declaration).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
{
    ASMReadFence();
}
3789
3790
/**
 * SFENCE worker for builds without the assembly implementation; simply calls
 * ASMWriteFence() (presumably a store fence - see its IPRT declaration).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
{
    ASMWriteFence();
}
3795
3796
/**
 * MFENCE worker for builds without the assembly implementation; simply calls
 * ASMMemoryFence() (presumably a full memory fence - see its IPRT declaration).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
{
    ASMMemoryFence();
}
3801
3802
3803# ifndef RT_ARCH_ARM64
/**
 * Alternative memory fence worker; in this portable build it is the same
 * ASMMemoryFence() call as iemAImpl_mfence.  Not compiled for RT_ARCH_ARM64.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
{
    ASMMemoryFence();
}
3808# endif
3809
3810# endif
3811
3812#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3813
3814
3815IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3816{
3817 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3818 {
3819 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3820 *pu16Dst |= u16Src & X86_SEL_RPL;
3821
3822 *pfEFlags |= X86_EFL_ZF;
3823 }
3824 else
3825 *pfEFlags &= ~X86_EFL_ZF;
3826}
3827
3828
3829#if defined(IEM_WITHOUT_ASSEMBLY)
3830
3831/*********************************************************************************************************************************
3832* x87 FPU Loads *
3833*********************************************************************************************************************************/
3834
/**
 * Converts a 32-bit floating point value to the 80-bit format for loading.
 *
 * The returned FSW has TOP set to 7 and carries over C0, C2 and C3 from the
 * incoming status word; exception bits are added per input class:
 *  - Normal, zero and infinity: converted, no exceptions.
 *  - Subnormal: normalized, DE is set, and ES+B are added if DM is clear in FCW.
 *  - Quiet NaN: fraction is widened, no exceptions.
 *  - Signalling NaN: made quiet and IE is set.  If IM is clear in FCW, TOP is
 *    cleared, ES+B are set and the result value is zeroed.
 *
 * @param   pFpuState   The FPU state; only FSW and FCW are read.
 * @param   pFpuRes     Where to return the 80-bit value and the FSW.
 * @param   pr32Val     The 32-bit value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (RTFLOAT32U_IS_NORMAL(pr32Val))
    {
        /* Widen the fraction (left aligned) and rebias the exponent. */
        pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger = 1;
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
        pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
        Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
    }
    else if (RTFLOAT32U_IS_ZERO(pr32Val))
    {
        /* Signed zero stays a signed zero. */
        pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = 0;
        pFpuRes->r80Result.s.uMantissa = 0;
        Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
    }
    else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
    {
        /* Subnormal values gets normalized. */
        pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger = 1;
        /* cExtraShift = number of leading zero bits in the 32-bit fraction field.  The extra
           '+ 1' below moves the top set fraction bit into the integer bit position, where the
           uFraction bitfield assignment drops it (fInteger is set explicitly above). */
        unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
        pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
        pFpuRes->FSW |= X86_FSW_DE;
        if (!(pFpuState->FCW & X86_FCW_DM))
            pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
    }
    else if (RTFLOAT32U_IS_INF(pr32Val))
    {
        /* Infinity: max exponent, integer bit only. */
        pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
        Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
    }
    else
    {
        /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
        Assert(RTFLOAT32U_IS_NAN(pr32Val));
        pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.sj64.fInteger = 1;
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
        if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
        {
            pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
            pFpuRes->FSW |= X86_FSW_IE;

            if (!(pFpuState->FCW & X86_FCW_IM))
            {
                /* The value is not pushed. */
                pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
                pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
                pFpuRes->r80Result.au64[0] = 0;
                pFpuRes->r80Result.au16[4] = 0;
            }
        }
        else
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
    }
}
3902
3903
/**
 * Converts a 64-bit floating point value to the 80-bit format for loading.
 *
 * The returned FSW has TOP set to 7 and carries over C0, C2 and C3 from the
 * incoming status word; exception bits are added per input class:
 *  - Normal, zero and infinity: converted, no exceptions.
 *  - Subnormal: normalized, DE is set, and ES+B are added if DM is clear in FCW.
 *  - Quiet NaN: fraction is widened, no exceptions.
 *  - Signalling NaN: made quiet and IE is set.  If IM is clear in FCW, TOP is
 *    cleared, ES+B are set and the result value is zeroed.
 *
 * @param   pFpuState   The FPU state; only FSW and FCW are read.
 * @param   pFpuRes     Where to return the 80-bit value and the FSW.
 * @param   pr64Val     The 64-bit value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (RTFLOAT64U_IS_NORMAL(pr64Val))
    {
        /* Widen the fraction (left aligned) and rebias the exponent. */
        pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger = 1;
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
        pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
        Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
    }
    else if (RTFLOAT64U_IS_ZERO(pr64Val))
    {
        /* Signed zero stays a signed zero. */
        pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = 0;
        pFpuRes->r80Result.s.uMantissa = 0;
        Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
    }
    else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
    {
        /* Subnormal values gets normalized. */
        pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger = 1;
        /* cExtraShift = number of leading zero bits in the 64-bit fraction field.  The extra
           '+ 1' below moves the top set fraction bit into the integer bit position, where the
           uFraction bitfield assignment drops it (fInteger is set explicitly above). */
        unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
        pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
        pFpuRes->FSW |= X86_FSW_DE;
        if (!(pFpuState->FCW & X86_FCW_DM))
            pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
    }
    else if (RTFLOAT64U_IS_INF(pr64Val))
    {
        /* Infinity: max exponent, integer bit only. */
        pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
        Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
    }
    else
    {
        /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
        Assert(RTFLOAT64U_IS_NAN(pr64Val));
        pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.sj64.fInteger = 1;
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
        if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
        {
            pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
            pFpuRes->FSW |= X86_FSW_IE;

            if (!(pFpuState->FCW & X86_FCW_IM))
            {
                /* The value is not pushed. */
                pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
                pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
                pFpuRes->r80Result.au64[0] = 0;
                pFpuRes->r80Result.au16[4] = 0;
            }
        }
        else
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
    }
}
3969
3970
3971IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3972{
3973 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3974 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3975 /* Raises no exceptions. */
3976 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3977}
3978
3979
3980IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3981{
3982 pFpuRes->r80Result.sj64.fSign = 0;
3983 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3984 pFpuRes->r80Result.sj64.fInteger = 1;
3985 pFpuRes->r80Result.sj64.uFraction = 0;
3986
3987 /*
3988 * FPU status word:
3989 * - TOP is irrelevant, but we must match x86 assembly version.
3990 * - C1 is always cleared as we don't have any stack overflows.
3991 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3992 */
3993 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3994}
3995
3996
3997IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3998{
3999 pFpuRes->r80Result.sj64.fSign = 0;
4000 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4001 pFpuRes->r80Result.sj64.fInteger = 1;
4002 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4003 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4004 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
4005 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4006}
4007
4008
4009IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4010{
4011 pFpuRes->r80Result.sj64.fSign = 0;
4012 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4013 pFpuRes->r80Result.sj64.fInteger = 1;
4014 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
4015 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
4016 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4017}
4018
4019
4020IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4021{
4022 pFpuRes->r80Result.sj64.fSign = 0;
4023 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
4024 pFpuRes->r80Result.sj64.fInteger = 1;
4025 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4026 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4027 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
4028 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4029}
4030
4031
4032IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4033{
4034 pFpuRes->r80Result.sj64.fSign = 0;
4035 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
4036 pFpuRes->r80Result.sj64.fInteger = 1;
4037 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4038 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4039 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
4040 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4041}
4042
4043
4044IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4045{
4046 pFpuRes->r80Result.sj64.fSign = 0;
4047 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4048 pFpuRes->r80Result.sj64.fInteger = 1;
4049 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4050 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4051 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
4052 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4053}
4054
4055
4056IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4057{
4058 pFpuRes->r80Result.s.fSign = 0;
4059 pFpuRes->r80Result.s.uExponent = 0;
4060 pFpuRes->r80Result.s.uMantissa = 0;
4061 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4062}
4063
4064#define EMIT_FILD(a_cBits) \
4065IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
4066 int ## a_cBits ## _t const *piVal)) \
4067{ \
4068 int ## a_cBits ## _t iVal = *piVal; \
4069 if (iVal == 0) \
4070 { \
4071 pFpuRes->r80Result.s.fSign = 0; \
4072 pFpuRes->r80Result.s.uExponent = 0; \
4073 pFpuRes->r80Result.s.uMantissa = 0; \
4074 } \
4075 else \
4076 { \
4077 if (iVal > 0) \
4078 pFpuRes->r80Result.s.fSign = 0; \
4079 else \
4080 { \
4081 pFpuRes->r80Result.s.fSign = 1; \
4082 iVal = -iVal; \
4083 } \
4084 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
4085 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
4086 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
4087 } \
4088 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
4089}
4090EMIT_FILD(16)
4091EMIT_FILD(32)
4092EMIT_FILD(64)
4093
4094
4095IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
4096{
4097 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4098 if ( pd80Val->s.abPairs[0] == 0
4099 && pd80Val->s.abPairs[1] == 0
4100 && pd80Val->s.abPairs[2] == 0
4101 && pd80Val->s.abPairs[3] == 0
4102 && pd80Val->s.abPairs[4] == 0
4103 && pd80Val->s.abPairs[5] == 0
4104 && pd80Val->s.abPairs[6] == 0
4105 && pd80Val->s.abPairs[7] == 0
4106 && pd80Val->s.abPairs[8] == 0)
4107 {
4108 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4109 pFpuRes->r80Result.s.uExponent = 0;
4110 pFpuRes->r80Result.s.uMantissa = 0;
4111 }
4112 else
4113 {
4114 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4115
4116 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
4117 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
4118 cPairs--;
4119
4120 uint64_t uVal = 0;
4121 uint64_t uFactor = 1;
4122 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
4123 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
4124 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
4125
4126 unsigned const cBits = ASMBitLastSetU64(uVal);
4127 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
4128 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
4129 }
4130}
4131
4132
4133/*********************************************************************************************************************************
4134* x87 FPU Stores *
4135*********************************************************************************************************************************/
4136
/**
 * Helper for storing a deconstructed and normal R80 value as a 32-bit one.
 *
 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
 *
 * @returns Updated FPU status word value.
 * @param   fSignIn         Incoming sign indicator.
 * @param   uMantissaIn     Incoming mantissa (dot between bit 63 and 62).
 * @param   iExponentIn     Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            Prepped FPU status word, i.e. exceptions and C1 clear.
 * @param   pr32Dst         Where to return the output value, if one should be
 *                          returned.  Not written on the early returns taken
 *                          for unmasked underflow or overflow.
 *
 * @note    Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
 * @note    Exact same logic as iemAImpl_StoreNormalR80AsR64.
 */
static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
                                             uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
{
    /* The mask covers the mantissa bits that do not fit in the 32-bit fraction.  With 63 and 23
       fraction bits respectively (see the IPRT definitions) that is the low 40 bits. */
    uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0xffffffffff */
    /* Nearest: add half an ULP.  Directed rounding away from zero: add the whole mask.  Otherwise (towards zero): nothing. */
    uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                    ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1)  /* 0x8000000000 */
                                    : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                    ? fRoundingOffMask
                                    : 0;
    uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;

    /*
     * Deal with potential overflows/underflows first, optimizing for none.
     * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
     */
    int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
    if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
    { /* likely? */ }
    /*
     * Underflow if the exponent zero or negative.  This is attempted mapped
     * to a subnormal number when possible, with some additional trickery ofc.
     */
    else if (iExponentOut <= 0)
    {
        bool const fIsTiny = iExponentOut < 0
                          || UINT64_MAX - uMantissaIn > uRoundingAdd;
        if (!(fFcw & X86_FCW_UM) && fIsTiny)
            /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
            return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;

        if (iExponentOut <= 0)
        {
            /* Denormalize: shift the mantissa right by 1 - iExponentOut bits, folding any
               bits shifted out into bit 0 as a sticky bit so they still count as rounded off. */
            uMantissaIn = iExponentOut <= -63
                        ? uMantissaIn != 0
                        : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
            fRoundedOff = uMantissaIn & fRoundingOffMask;
            if (fRoundedOff && fIsTiny)
                fFsw |= X86_FSW_UE;
            iExponentOut = 0;
        }
    }
    /*
     * Overflow if at or above max exponent value or if we will reach max
     * when rounding.  Will return +/-infinity or +/-max value depending on
     * whether we're rounding or not.
     */
    else if (   iExponentOut >= RTFLOAT32U_EXP_MAX
             || (   iExponentOut == RTFLOAT32U_EXP_MAX - 1
                 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
    {
        fFsw |= X86_FSW_OE;
        if (!(fFcw & X86_FCW_OM))
            return fFsw | X86_FSW_ES | X86_FSW_B;
        fFsw |= X86_FSW_PE;
        if (uRoundingAdd)
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;

        pr32Dst->s.fSign = fSignIn;
        if (uRoundingAdd)
        {   /* Infinity (max exponent, zero fraction) */
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = 0;
        }
        else
        {   /* Max */
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
            pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
        }
        return fFsw;
    }

    /*
     * Normal or subnormal number.
     */
    /* Do rounding - just truncate in near mode when midway on an even outcome. */
    uint64_t uMantissaOut = uMantissaIn;
    if (   (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
        || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
        || fRoundedOff != uRoundingAdd)
    {
        uMantissaOut = uMantissaIn + uRoundingAdd;
        if (uMantissaOut >= uMantissaIn)
        { /* likely */ }
        else
        {
            /* The addition wrapped, i.e. the mantissa overflowed into the next binade. */
            uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
            iExponentOut++;
            Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
            fFsw |= X86_FSW_C1;
        }
    }
    else
        uMantissaOut = uMantissaIn;

    /* Truncate the mantissa and set the return value. */
    uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;

    pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
    pr32Dst->s.uExponent = iExponentOut;
    pr32Dst->s.fSign = fSignIn;

    /* Set status flags related to rounding. */
    if (fRoundedOff)
    {
        fFsw |= X86_FSW_PE;
        /* C1 indicates that the stored value was rounded up (in magnitude). */
        if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    return fFsw;
}
4269
4270
/**
 * Stores an 80-bit floating point value as a 32-bit one.
 *
 * Normal values are rounded and range checked by iemAImpl_StoreNormalR80AsR32.
 * Zeros, infinities and NaNs are mapped to their 32-bit counterparts, and
 * denormals are flushed to zero or the smallest subnormal depending on the
 * rounding mode.  In the branches that raise an unmasked exception without
 * assigning to @a pr32Dst, the destination is left unmodified.
 *
 * @param   pFpuState   The FPU state; only FCW and FSW are read.
 * @param   pu16FSW     Where to return the FPU status word (TOP=7, C0/C2/C3
 *                      carried over from the incoming FSW, plus any exception
 *                      bits raised here).
 * @param   pr32Dst     Where to store the 32-bit value.
 * @param   pr80Src     The 80-bit value to store.
 *
 * @note    Exact same logic as iemAImpl_fst_r80_to_r64.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
        fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
                                            (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
    {
        pr32Dst->s.fSign = pr80Src->s.fSign;
        pr32Dst->s.uExponent = 0;
        pr32Dst->s.uFraction = 0;
        Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
    }
    else if (RTFLOAT80U_IS_INF(pr80Src))
    {
        pr32Dst->s.fSign = pr80Src->s.fSign;
        pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
        pr32Dst->s.uFraction = 0;
        Assert(RTFLOAT32U_IS_INF(pr32Dst));
    }
    else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
    {
        /* Mapped to +/-QNaN */
        pr32Dst->s.fSign = pr80Src->s.fSign;
        pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
        pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
    }
    else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
    {
        /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
        if (fFcw & X86_FCW_IM)
        {
            /* Masked: store a negative QNaN with only the top fraction bit set. */
            pr32Dst->s.fSign = 1;
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
            fFsw |= X86_FSW_IE;
        }
        else
            fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
    }
    else if (RTFLOAT80U_IS_NAN(pr80Src))
    {
        /* IM applies to signalled NaN input only.  Everything is converted to quiet NaN. */
        if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
        {
            /* Keep the top fraction bits as payload and force the quiet bit. */
            pr32Dst->s.fSign = pr80Src->s.fSign;
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
            pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
            if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
                fFsw |= X86_FSW_IE;
        }
        else
            fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
    }
    else
    {
        /* Denormal values causes both an underflow and precision exception. */
        Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
        if (fFcw & X86_FCW_UM)
        {
            pr32Dst->s.fSign = pr80Src->s.fSign;
            pr32Dst->s.uExponent = 0;
            /* Rounding away from zero gives the smallest subnormal (C1 set), anything else gives zero. */
            if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
            {
                pr32Dst->s.uFraction = 1;
                fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
                if (!(fFcw & X86_FCW_PM))
                    fFsw |= X86_FSW_ES | X86_FSW_B;
            }
            else
            {
                pr32Dst->s.uFraction = 0;
                fFsw |= X86_FSW_UE | X86_FSW_PE;
                if (!(fFcw & X86_FCW_PM))
                    fFsw |= X86_FSW_ES | X86_FSW_B;
            }
        }
        else
            fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
    }
    *pu16FSW = fFsw;
}
4359
4360
/**
 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
 *
 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
 *
 * @returns Updated FPU status word value.
 * @param   fSignIn         Incoming sign indicator.
 * @param   uMantissaIn     Incoming mantissa (dot between bit 63 and 62).
 * @param   iExponentIn     Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            Prepped FPU status word, i.e. exceptions and C1 clear.
 * @param   pr64Dst         Where to return the output value, if one should be
 *                          returned.  Not written on the early-return paths
 *                          that report an unmasked underflow or overflow.
 *
 * @note    Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
 * @note    Exact same logic as iemAImpl_StoreNormalR80AsR32.
 */
static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
                                             uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
{
    /* Mask covering the low mantissa bits that do not fit into the 64-bit fraction. */
    uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
    /* Addend applied before truncating: half of the dropped range for round-to-nearest,
       the whole mask when rounding away from zero (up for positive, down for negative
       values), and zero otherwise. */
    uint32_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                    ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1)  /* 0x400 */
                                    : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                    ? fRoundingOffMask
                                    : 0;
    /* The bits that will be dropped; non-zero means the result is inexact. */
    uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;

    /*
     * Deal with potential overflows/underflows first, optimizing for none.
     * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
     */
    int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
    /* Single unsigned compare accepting biased exponents in the range [1, MAX - 3]. */
    if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
    { /* likely? */ }
    /*
     * Underflow if the exponent is zero or negative.  We attempt to map the
     * value to a subnormal number when possible, with some additional trickery.
     */
    else if (iExponentOut <= 0)
    {
        /* Tiny unless the exponent is exactly zero and adding the rounding value
           would carry out of the 64-bit mantissa. */
        bool const fIsTiny = iExponentOut < 0
                          || UINT64_MAX - uMantissaIn > uRoundingAdd;
        if (!(fFcw & X86_FCW_UM) && fIsTiny)
            /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
            return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;

        /* Note: this condition repeats the enclosing 'else if' and is always true here. */
        if (iExponentOut <= 0)
        {
            /* Shift the mantissa right to denormalize it, folding all bits shifted
               out into bit 0 as a sticky bit so the inexact state is not lost. */
            uMantissaIn = iExponentOut <= -63
                        ? uMantissaIn != 0
                        : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
            fRoundedOff = uMantissaIn & fRoundingOffMask;
            if (fRoundedOff && fIsTiny)
                fFsw |= X86_FSW_UE;
            iExponentOut = 0;
        }
    }
    /*
     * Overflow if at or above max exponent value or if we will reach max
     * when rounding.  Returns +/-infinity (exponent MAX, fraction zero) when a
     * rounding addend applies, otherwise the largest finite +/-value.
     */
    else if (   iExponentOut >= RTFLOAT64U_EXP_MAX
             || (   iExponentOut == RTFLOAT64U_EXP_MAX - 1
                 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
    {
        fFsw |= X86_FSW_OE;
        if (!(fFcw & X86_FCW_OM))
            return fFsw | X86_FSW_ES | X86_FSW_B;
        fFsw |= X86_FSW_PE;
        if (uRoundingAdd)
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;

        pr64Dst->s64.fSign = fSignIn;
        if (uRoundingAdd)
        {   /* Infinity: maximum exponent with a zero fraction. */
            pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
            pr64Dst->s64.uFraction = 0;
        }
        else
        {   /* Max: largest finite value. */
            pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
            pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
        }
        return fFsw;
    }

    /*
     * Normal or subnormal number.
     */
    /* Do rounding - just truncate in near mode when midway on an even outcome. */
    uint64_t uMantissaOut = uMantissaIn;
    if (   (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
        || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
        || fRoundedOff != uRoundingAdd)
    {
        uMantissaOut = uMantissaIn + uRoundingAdd;
        if (uMantissaOut >= uMantissaIn)
        { /* likely */ }
        else
        {
            /* The addition wrapped around, i.e. the mantissa carried out of bit 63. */
            uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
            iExponentOut++;
            Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
            fFsw |= X86_FSW_C1;
        }
    }
    else
        uMantissaOut = uMantissaIn; /* (Redundant: already initialized to this value above.) */

    /* Truncate the mantissa and set the return value. */
    uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;

    pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
    pr64Dst->s64.uExponent = iExponentOut;
    pr64Dst->s64.fSign     = fSignIn;

    /* Set status flags related to rounding. */
    if (fRoundedOff)
    {
        fFsw |= X86_FSW_PE;
        /* C1 is set when the stored value ended up larger than plain truncation would give. */
        if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    return fFsw;
}
4493
4494
4495/**
4496 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4497 */
4498IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4499 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4500{
4501 uint16_t const fFcw = pFpuState->FCW;
4502 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4503 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4504 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4505 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4506 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4507 {
4508 pr64Dst->s64.fSign = pr80Src->s.fSign;
4509 pr64Dst->s64.uExponent = 0;
4510 pr64Dst->s64.uFraction = 0;
4511 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4512 }
4513 else if (RTFLOAT80U_IS_INF(pr80Src))
4514 {
4515 pr64Dst->s64.fSign = pr80Src->s.fSign;
4516 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4517 pr64Dst->s64.uFraction = 0;
4518 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4519 }
4520 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4521 {
4522 /* Mapped to +/-QNaN */
4523 pr64Dst->s64.fSign = pr80Src->s.fSign;
4524 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4525 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4526 }
4527 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4528 {
4529 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4530 if (fFcw & X86_FCW_IM)
4531 {
4532 pr64Dst->s64.fSign = 1;
4533 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4534 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4535 fFsw |= X86_FSW_IE;
4536 }
4537 else
4538 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4539 }
4540 else if (RTFLOAT80U_IS_NAN(pr80Src))
4541 {
4542 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4543 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4544 {
4545 pr64Dst->s64.fSign = pr80Src->s.fSign;
4546 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4547 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4548 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4549 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4550 fFsw |= X86_FSW_IE;
4551 }
4552 else
4553 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4554 }
4555 else
4556 {
4557 /* Denormal values causes both an underflow and precision exception. */
4558 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4559 if (fFcw & X86_FCW_UM)
4560 {
4561 pr64Dst->s64.fSign = pr80Src->s.fSign;
4562 pr64Dst->s64.uExponent = 0;
4563 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4564 {
4565 pr64Dst->s64.uFraction = 1;
4566 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4567 if (!(fFcw & X86_FCW_PM))
4568 fFsw |= X86_FSW_ES | X86_FSW_B;
4569 }
4570 else
4571 {
4572 pr64Dst->s64.uFraction = 0;
4573 fFsw |= X86_FSW_UE | X86_FSW_PE;
4574 if (!(fFcw & X86_FCW_PM))
4575 fFsw |= X86_FSW_ES | X86_FSW_B;
4576 }
4577 }
4578 else
4579 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4580 }
4581 *pu16FSW = fFsw;
4582}
4583
4584
4585IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4586 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4587{
4588 /*
4589 * FPU status word:
4590 * - TOP is irrelevant, but we must match x86 assembly version (0).
4591 * - C1 is always cleared as we don't have any stack overflows.
4592 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4593 */
4594 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4595 *pr80Dst = *pr80Src;
4596}
4597
4598
/*
 * Template for the functions converting an 80-bit floating point value to a
 * signed integer, rounding according to the rounding control in FCW.
 *
 * Mantissa (the 'Exp' row gives the unbiased exponent at which the bit above
 * the label becomes the lowest integer bit):
 *      63         56        48        40        32        24        16        8         0
 *      v          v         v         v         v         v         v         v         v
 *      1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
 * Exp: 0       4    8    12   16   20   24   28   32   36   40   44   48   52   56   60
 *
 * int64_t has the same width, only bit 63 is the sign bit.  So, the max we can map over
 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62.  The number of bits we
 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
 * where we'll drop off all but bit 63.
 *
 * Macro parameters:
 *      a_cBits             Width of the target integer in bits; also pasted
 *                          into the name of the generated function.
 *      a_iType             The target integer type.
 *      a_iTypeMin          Value stored for results equal to the most
 *                          negative representable integer.
 *      a_iTypeIndefinite   Value stored when the operand is invalid or out
 *                          of range and the invalid-operation exception is
 *                          masked.
 *
 * The generated function leaves *piDst untouched when an unmasked invalid
 * operation is raised.
 */
#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                           a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t uMantissa = pr80Val->s.uMantissa; \
        int32_t  iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        /* Unsigned compare: only exponents in the range 0 thru a_cBits - 2 pass. */ \
        if ((uint32_t)iExponent <= a_cBits - 2) \
        { \
            /* Number of fraction bits to drop, and the matching rounding addend: \
               half for nearest, the full mask when rounding away from zero. */ \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
                                            ? RT_BIT_64(cShiftOff - 1) \
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
                                            ? fRoundingOffMask \
                                            : 0; \
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask; \
            \
            uMantissa >>= cShiftOff; \
            /* uRounding is the carry out of the dropped bits after adding the rounding value. */ \
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
            uMantissa += uRounding; \
            if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
            { \
                if (fRoundedOff) \
                { \
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
                    else if (uRounding) \
                        fFsw |= X86_FSW_C1; \
                    fFsw |= X86_FSW_PE; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                \
                if (!fSignIn) \
                    *piDst = (a_iType)uMantissa; \
                else \
                    *piDst = -(a_iType)uMantissa; \
            } \
            else \
            { \
                /* overflowed after rounding. */ \
                AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
                          ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
                           pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
                \
                /* Special case for the integer minimum value. */ \
                if (fSignIn) \
                { \
                    *piDst = a_iTypeMin; \
                    fFsw |= X86_FSW_PE | X86_FSW_C1; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                else \
                { \
                    /* NOTE(review): this path stores a_iTypeMin whereas the other invalid \
                       paths store a_iTypeIndefinite; presumably both are the same value \
                       for every instantiation - confirm. */ \
                    fFsw |= X86_FSW_IE; \
                    if (fFcw & X86_FCW_IM) \
                        *piDst = a_iTypeMin; \
                    else \
                        fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
                } \
            } \
        } \
        /* \
         * Tiny numbers with a magnitude below one (negative exponent). \
         * These round to 0, 1 or -1 and are always inexact. \
         */ \
        else if (iExponent < 0) \
        { \
            if (!fSignIn) \
            { \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                { \
                    *piDst = 1; \
                    fFsw |= X86_FSW_C1; \
                } \
                else \
                    *piDst = 0; \
            } \
            else \
            { \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                    *piDst = 0; \
                else \
                { \
                    *piDst = -1; \
                    fFsw |= X86_FSW_C1; \
                } \
            } \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case: negative input with exponent a_cBits - 1 that is \
         * still stored as a_iTypeMin; inexact when low mantissa bits are set. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (   a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63))) \
        { \
            *piDst = a_iTypeMin; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are either rounded \
     * to zero, 1 or -1 depending on sign and rounding control. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
            *piDst = 0; \
        else \
        { \
            *piDst = fSignIn ? -1 : 1; \
            fFsw |= X86_FSW_C1; \
        } \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
/* Instantiate the 64-bit, 32-bit and 16-bit conversion functions. */
EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4785
4786#endif /*IEM_WITHOUT_ASSEMBLY */
4787
4788
/*
 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
 *
 * The generated functions always truncate (the rounding control in FCW is not
 * consulted) and never set C1.
 *
 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
 * thus the @a a_cBitsIn.
 *
 * Macro parameters:
 *      a_cBits             Width of the target integer in bits; also pasted
 *                          into the name of the generated function.
 *      a_cBitsIn           Width used for the in-range exponent check.  All
 *                          instantiations below pass the same value as for
 *                          a_cBits.
 *      a_iType             The target integer type.
 *      a_iTypeMin          Value stored in the special MIN case.
 *      a_iTypeMax          Not referenced by the macro body.
 *      a_iTypeIndefinite   Value stored when the operand is invalid or out
 *                          of range and the invalid-operation exception is
 *                          masked.
 *      a_Suffix            Suffix appended to the generated function name.
 *      a_fIntelVersion     Not referenced by the macro body.
 */
#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                                              a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t uMantissa = pr80Val->s.uMantissa; \
        int32_t  iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        /* Unsigned compare: only exponents in the range 0 thru a_cBitsIn - 2 pass. */ \
        if ((uint32_t)iExponent <= a_cBitsIn - 2) \
        { \
            /* Drop the fraction bits; anything dropped makes the result inexact. */ \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            uint64_t const fRoundedOff      = uMantissa & fRoundingOffMask; \
            uMantissa >>= cShiftOff; \
            /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
            if (!fSignIn) \
                *piDst = (a_iType)uMantissa; \
            else \
                *piDst = -(a_iType)uMantissa; \
            \
            if (fRoundedOff) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Tiny numbers with a magnitude below one are truncated to zero. \
         */ \
        else if (iExponent < 0) \
        { \
            *piDst = 0; \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case: negative input with exponent a_cBits - 1 that is \
         * still stored as a_iTypeMin; inexact when low mantissa bits are set. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (a_cBits < 64 \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63)) ) \
        { \
            *piDst = a_iTypeMin; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Figure this weirdness. \
         * (This branch is disabled by the leading constant 0 in its condition.) \
         */ \
        else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
        { \
            *piDst = 0; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are truncated to zero. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        *piDst = 0; \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
/* The suffix-less variants are only emitted when IEM_WITHOUT_ASSEMBLY is defined. */
#if defined(IEM_WITHOUT_ASSEMBLY)
EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
#endif
/* The _intel and _amd flavours of the 16-bit variant are always emitted. */
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd,   0)
4918
4919
4920#if defined(IEM_WITHOUT_ASSEMBLY)
4921
/**
 * Converts the 80-bit floating point value at @a pr80Src to an 80-bit packed
 * BCD integer at @a pd80Dst, rounding according to the rounding control in FCW.
 *
 * The destination is left unwritten when an unmasked invalid operation is
 * raised.
 *
 * @param   pFpuState   FPU state supplying the control word (FCW) and the
 *                      C0, C2 and C3 bits carried over from the status word.
 * @param   pu16FSW     Where to return the resulting status word.
 * @param   pd80Dst     Where to store the packed BCD result.
 * @param   pr80Src     The value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
{
    /* Constant results, each table indexed by the sign of the input. */
    /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
    static RTPBCD80U const s_ad80Zeros[2]  = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
    static RTPBCD80U const s_ad80One[2]    = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
                                               RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
    static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();

    uint16_t const fFcw    = pFpuState->FCW;
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    bool const     fSignIn = pr80Src->s.fSign;

    /*
     * Deal with normal numbers first.
     */
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
    {
        uint64_t uMantissa = pr80Src->s.uMantissa;
        int32_t  iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
        /* At exponent 59 the integer part is the mantissa shifted right by 4, so the
           bound below admits integer parts up to 0x0de0b6b3a763ffff, i.e. 10^18 - 1. */
        if (   (uint32_t)iExponent <= 58
            || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
        {
            /* Number of fraction bits to drop, and the matching rounding addend:
               half for nearest, the full mask when rounding away from zero. */
            unsigned const cShiftOff        = 63 - iExponent;
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                            ? RT_BIT_64(cShiftOff - 1)
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                            ? fRoundingOffMask
                                            : 0;
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask;

            uMantissa >>= cShiftOff;
            /* uRounding is the carry out of the dropped bits after adding the rounding value. */
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
            uMantissa += uRounding;
            if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
            {
                if (fRoundedOff)
                {
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
                    else if (uRounding)
                        fFsw |= X86_FSW_C1;
                    fFsw |= X86_FSW_PE;
                    if (!(fFcw & X86_FCW_PM))
                        fFsw |= X86_FSW_ES | X86_FSW_B;
                }

                pd80Dst->s.fSign = fSignIn;
                pd80Dst->s.uPad  = 0;
                /* Emit two decimal digits per array entry, least significant pair first. */
                for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
                {
                    unsigned const uDigits = uMantissa % 100;
                    uMantissa /= 100;
                    uint8_t const bLo = uDigits % 10;
                    uint8_t const bHi = uDigits / 10;
                    pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
                }
            }
            else
            {
                /* overflowed after rounding. */
                fFsw |= X86_FSW_IE;
                if (fFcw & X86_FCW_IM)
                    *pd80Dst = s_d80Indefinite;
                else
                    fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
            }
        }
        /*
         * Tiny numbers with a magnitude below one (negative exponent).
         * These round to zero or one (with the input sign) and are always inexact.
         */
        else if (iExponent < 0)
        {
            if (!fSignIn)
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
                else
                    *pd80Dst = s_ad80Zeros[fSignIn];
            }
            else
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                    *pd80Dst = s_ad80Zeros[fSignIn];
                else
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
            }
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        /*
         * Too large/small number outside the target integer range.
         */
        else
        {
            fFsw |= X86_FSW_IE;
            if (fFcw & X86_FCW_IM)
                *pd80Dst = s_d80Indefinite;
            else
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    /*
     * Map +0 and -0 to the zero constant selected by the input sign.  (The two
     * table entries are initialized with 0 and 1 respectively, presumably the
     * sign flag, so unlike the integer stores the sign is not dropped here.)
     */
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
        *pd80Dst = s_ad80Zeros[fSignIn];
    /*
     * Denormals are just really tiny sub-zero numbers that are either rounded
     * to zero, 1 or -1 depending on sign and rounding control.
     */
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
    {
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
            *pd80Dst = s_ad80Zeros[fSignIn];
        else
        {
            *pd80Dst = s_ad80One[fSignIn];
            fFsw |= X86_FSW_C1;
        }
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }
    /*
     * All other special values are considered invalid arguments and result
     * in an IE exception and indefinite value if masked.
     */
    else
    {
        fFsw |= X86_FSW_IE;
        if (fFcw & X86_FCW_IM)
            *pd80Dst = s_d80Indefinite;
        else
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
    }
    *pu16FSW = fFsw;
}
5071
5072
5073/*********************************************************************************************************************************
5074* FPU Helpers *
5075*********************************************************************************************************************************/
/* Compile-time checks that the floating point unions have the expected sizes in bytes. */
AssertCompileSize(RTFLOAT128U, 16);
AssertCompileSize(RTFLOAT80U, 10);
AssertCompileSize(RTFLOAT64U, 8);
AssertCompileSize(RTFLOAT32U, 4);
5080
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro will declare a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
 * a normalization was performed.
 *
 * Because it declares a variable, the macro can only be used once per scope
 * with a given @a a_r80ValNormalized name.  The expansion ends in a dangling
 * 'else do {} while (0)', so the invocation must be followed by a semicolon.
 *
 * @param   a_pr80Val           Pointer variable to the value to check; it is
 *                              redirected to the local copy when the value
 *                              is a pseudo-denormal.
 * @param   a_r80ValNormalized  Name of the local RTFLOAT80U to declare.
 *
 * @note    This must be applied before calling SoftFloat with a value that could be
 *          a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
 *          correctly.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { \
        a_r80ValNormalized = *a_pr80Val; \
        a_r80ValNormalized.s.uExponent = 1; \
        a_pr80Val = &a_r80ValNormalized; \
    } else do {} while (0)
5104
5105#ifdef IEM_WITH_FLOAT128_FOR_FPU
5106
5107DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
5108{
5109 int fNew;
5110 switch (fFcw & X86_FCW_RC_MASK)
5111 {
5112 default:
5113 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
5114 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
5115 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
5116 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
5117 }
5118 int fOld = fegetround();
5119 fesetround(fNew);
5120 return fOld;
5121}
5122
5123
5124DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
5125{
5126 fesetround(fOld);
5127}
5128
5129DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
5130{
5131 RT_NOREF(fFcw);
5132 RTFLOAT128U Tmp;
5133 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
5134 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
5135 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
5136 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
5137 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
5138 {
5139 Assert(Tmp.s.uExponent == 0);
5140 Tmp.s2.uSignAndExponent++;
5141 }
5142 return *(_Float128 *)&Tmp;
5143}
5144
5145
5146DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5147{
5148 RT_NOREF(fFcw);
5149 RTFLOAT128U Tmp;
5150 *(_Float128 *)&Tmp = rd128ValSrc;
5151 ASMCompilerBarrier();
5152 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5153 {
5154 pr80Dst->s.fSign = Tmp.s64.fSign;
5155 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5156 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5157 | Tmp.s64.uFractionLo >> (64 - 15);
5158
5159 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5160 unsigned const cShiftOff = 64 - 15;
5161 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5162 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5163 if (uRoundedOff)
5164 {
5165 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5166 ? RT_BIT_64(cShiftOff - 1)
5167 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5168 ? fRoundingOffMask
5169 : 0;
5170 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5171 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5172 || uRoundedOff != uRoundingAdd)
5173 {
5174 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5175 {
5176 uFraction += 1;
5177 if (!(uFraction & RT_BIT_64(63)))
5178 { /* likely */ }
5179 else
5180 {
5181 uFraction >>= 1;
5182 pr80Dst->s.uExponent++;
5183 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5184 return fFsw;
5185 }
5186 fFsw |= X86_FSW_C1;
5187 }
5188 }
5189 fFsw |= X86_FSW_PE;
5190 if (!(fFcw & X86_FCW_PM))
5191 fFsw |= X86_FSW_ES | X86_FSW_B;
5192 }
5193 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5194 }
5195 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5196 {
5197 pr80Dst->s.fSign = Tmp.s64.fSign;
5198 pr80Dst->s.uExponent = 0;
5199 pr80Dst->s.uMantissa = 0;
5200 }
5201 else if (RTFLOAT128U_IS_INF(&Tmp))
5202 {
5203 pr80Dst->s.fSign = Tmp.s64.fSign;
5204 pr80Dst->s.uExponent = 0;
5205 pr80Dst->s.uMantissa = 0;
5206 }
5207 return fFsw;
5208}
5209
5210
5211#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5212
/**
 * Initializer for the SoftFloat state structure.
 *
 * Expands to a brace-enclosed initializer list with five values, in this
 * order (the member names are not visible here; the descriptions follow from
 * the values used):
 *      -# Tininess detection mode: always softfloat_tininess_afterRounding.
 *      -# Rounding mode derived from the FCW rounding control: nearest-even,
 *         max (up), min (down), or minMag for the remaining setting.
 *      -# A zero, presumably the initial exception flags - TODO confirm.
 *      -# The FCW bits covered by X86_FCW_XCPT_MASK.
 *      -# A width derived from the FCW precision control: 64 for PC_53,
 *         32 for PC_24 and 80 otherwise.
 *
 * @param   a_fFcw      The FPU control word.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
        ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP    ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN  ? (uint8_t)softfloat_round_min \
        :                                                    (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
        ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
    }
5226
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The result is @a a_fFsw ORed with:
 *      - the softfloat_flag_c1 bit of the exception flags shifted left by two
 *        (presumably landing on X86_FSW_C1 - TODO confirm),
 *      - the exception flags covered by X86_FSW_XCPT_MASK, and
 *      - X86_FSW_ES and X86_FSW_B if any of those raised exceptions has its
 *        bit clear in @a a_fFcw.
 *
 * @param   a_fFsw          The status word to start out with.
 * @param   a_pSoftState    Pointer to the SoftFloat state; only its
 *                          exceptionFlags member is read.
 * @param   a_fFcw          The FPU control word.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (  (a_fFsw) \
     | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
     | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
     | (  ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
        ? X86_FSW_ES | X86_FSW_B : 0) )
5234
5235
5236DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5237{
5238 RT_NOREF(fFcw);
5239 Assert(cBits > 64);
5240# if 0 /* rounding does not seem to help */
5241 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5242 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5243 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5244 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5245 {
5246 uint64_t uOld = r128.v[0];
5247 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5248 if (r128.v[0] < uOld)
5249 r128.v[1] += 1;
5250 }
5251# else
5252 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5253# endif
5254 return r128;
5255}
5256
5257
5258DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5259{
5260 RT_NOREF(fFcw);
5261 Assert(cBits > 64);
5262# if 0 /* rounding does not seem to help, not even on constants */
5263 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5264 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5265 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5266 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5267 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5268 {
5269 uint64_t uOld = r128.v[0];
5270 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5271 if (r128.v[0] < uOld)
5272 r128.v[1] += 1;
5273 }
5274 return r128;
5275# else
5276 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5277 return r128;
5278# endif
5279}
5280
5281
# if 0 /* unused */
/**
 * Converts from the IPRT 128-bit floating point union to the SoftFloat
 * structure by copying both 64-bit words unchanged.
 *
 * Currently compiled out as nothing uses it.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
    return r128;
}
# endif
5289
5290
5291/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5292DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5293{
5294 extFloat80_t Tmp;
5295 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5296 Tmp.signif = pr80Val->s2.uMantissa;
5297 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5298 return extF80_to_f128(Tmp, &Ignored);
5299}
5300
5301
5302/**
5303 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5304 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5305 *
5306 * This is only a structure format conversion, nothing else.
5307 */
5308DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5309{
5310 extFloat80_t Tmp;
5311 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5312 Tmp.signif = pr80Val->s2.uMantissa;
5313 return Tmp;
5314}
5315
5316
5317/**
5318 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5319 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5320 *
5321 * This is only a structure format conversion, nothing else.
5322 */
5323DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5324{
5325 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5326 pr80Dst->s2.uMantissa = r80XSrc.signif;
5327 return pr80Dst;
5328}
5329
5330
5331DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5332{
5333 RT_NOREF(fFcw);
5334 RTFLOAT128U Tmp;
5335 *(float128_t *)&Tmp = r128Src;
5336 ASMCompilerBarrier();
5337
5338 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5339 {
5340 pr80Dst->s.fSign = Tmp.s64.fSign;
5341 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5342 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5343 | Tmp.s64.uFractionLo >> (64 - 15);
5344
5345 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5346 unsigned const cShiftOff = 64 - 15;
5347 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5348 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5349 if (uRoundedOff)
5350 {
5351 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5352 ? RT_BIT_64(cShiftOff - 1)
5353 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5354 ? fRoundingOffMask
5355 : 0;
5356 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5357 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5358 || uRoundedOff != uRoundingAdd)
5359 {
5360 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5361 {
5362 uFraction += 1;
5363 if (!(uFraction & RT_BIT_64(63)))
5364 { /* likely */ }
5365 else
5366 {
5367 uFraction >>= 1;
5368 pr80Dst->s.uExponent++;
5369 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5370 return fFsw;
5371 }
5372 fFsw |= X86_FSW_C1;
5373 }
5374 }
5375 fFsw |= X86_FSW_PE;
5376 if (!(fFcw & X86_FCW_PM))
5377 fFsw |= X86_FSW_ES | X86_FSW_B;
5378 }
5379
5380 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5381 }
5382 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5383 {
5384 pr80Dst->s.fSign = Tmp.s64.fSign;
5385 pr80Dst->s.uExponent = 0;
5386 pr80Dst->s.uMantissa = 0;
5387 }
5388 else if (RTFLOAT128U_IS_INF(&Tmp))
5389 {
5390 pr80Dst->s.fSign = Tmp.s64.fSign;
5391 pr80Dst->s.uExponent = 0x7fff;
5392 pr80Dst->s.uMantissa = 0;
5393 }
5394 return fFsw;
5395}
5396
5397
5398/**
5399 * Helper for transfering exception and C1 to FSW and setting the result value
5400 * accordingly.
5401 *
5402 * @returns Updated FSW.
5403 * @param pSoftState The SoftFloat state following the operation.
5404 * @param r80XResult The result of the SoftFloat operation.
5405 * @param pr80Result Where to store the result for IEM.
5406 * @param fFcw The FPU control word.
5407 * @param fFsw The FSW before the operation, with necessary bits
5408 * cleared and such.
5409 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5410 * raised.
5411 */
5412DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5413 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5414 PCRTFLOAT80U pr80XcptResult)
5415{
5416 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5417 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5418 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5419 fFsw |= X86_FSW_ES | X86_FSW_B;
5420
5421 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5422 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5423 else
5424 {
5425 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5426 *pr80Result = *pr80XcptResult;
5427 }
5428 return fFsw;
5429}
5430
5431
5432/**
5433 * Helper doing polynomial evaluation using Horner's method.
5434 *
5435 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5436 */
5437float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5438 unsigned cPrecision, softfloat_state_t *pSoftState)
5439{
5440 Assert(cHornerConsts > 1);
5441 size_t i = cHornerConsts - 1;
5442 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5443 while (i-- > 0)
5444 {
5445 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5446 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5447 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5448 }
5449 return r128Result;
5450}
5451
5452#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5453
5454
5455/**
5456 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5457 * mantissa, exponent and sign.
5458 *
5459 * @returns Updated FSW.
5460 * @param pr80Dst Where to return the composed value.
5461 * @param fSign The sign.
5462 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5463 * ignored and should be zero. This will probably be
5464 * modified during normalization and rounding.
5465 * @param iExponent Unbiased exponent.
5466 * @param fFcw The FPU control word.
5467 * @param fFsw The FPU status word.
5468 */
5469static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5470 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5471{
5472 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5473
5474 iExponent += RTFLOAT80U_EXP_BIAS;
5475
5476 /* Do normalization if necessary and possible. */
5477 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5478 {
5479 int cShift = 192 - RTUInt256BitCount(puMantissa);
5480 if (iExponent > cShift)
5481 iExponent -= cShift;
5482 else
5483 {
5484 if (fFcw & X86_FCW_UM)
5485 {
5486 if (iExponent > 0)
5487 cShift = --iExponent;
5488 else
5489 cShift = 0;
5490 }
5491 iExponent -= cShift;
5492 }
5493 RTUInt256AssignShiftLeft(puMantissa, cShift);
5494 }
5495
5496 /* Do rounding. */
5497 uint64_t uMantissa = puMantissa->QWords.qw2;
5498 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5499 {
5500 bool fAdd;
5501 switch (fFcw & X86_FCW_RC_MASK)
5502 {
5503 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5504 case X86_FCW_RC_NEAREST:
5505 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5506 {
5507 if ( (uMantissa & 1)
5508 || puMantissa->QWords.qw0 != 0
5509 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5510 {
5511 fAdd = true;
5512 break;
5513 }
5514 uMantissa &= ~(uint64_t)1;
5515 }
5516 fAdd = false;
5517 break;
5518 case X86_FCW_RC_ZERO:
5519 fAdd = false;
5520 break;
5521 case X86_FCW_RC_UP:
5522 fAdd = !fSign;
5523 break;
5524 case X86_FCW_RC_DOWN:
5525 fAdd = fSign;
5526 break;
5527 }
5528 if (fAdd)
5529 {
5530 uint64_t const uTmp = uMantissa;
5531 uMantissa = uTmp + 1;
5532 if (uMantissa < uTmp)
5533 {
5534 uMantissa >>= 1;
5535 uMantissa |= RT_BIT_64(63);
5536 iExponent++;
5537 }
5538 fFsw |= X86_FSW_C1;
5539 }
5540 fFsw |= X86_FSW_PE;
5541 if (!(fFcw & X86_FCW_PM))
5542 fFsw |= X86_FSW_ES | X86_FSW_B;
5543 }
5544
5545 /* Check for underflow (denormals). */
5546 if (iExponent <= 0)
5547 {
5548 if (fFcw & X86_FCW_UM)
5549 {
5550 if (uMantissa & RT_BIT_64(63))
5551 uMantissa >>= 1;
5552 iExponent = 0;
5553 }
5554 else
5555 {
5556 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5557 fFsw |= X86_FSW_ES | X86_FSW_B;
5558 }
5559 fFsw |= X86_FSW_UE;
5560 }
5561 /* Check for overflow */
5562 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5563 {
5564 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5565 }
5566
5567 /* Compose the result. */
5568 pr80Dst->s.uMantissa = uMantissa;
5569 pr80Dst->s.uExponent = iExponent;
5570 pr80Dst->s.fSign = fSign;
5571 return fFsw;
5572}
5573
5574
5575/**
5576 * See also iemAImpl_fld_r80_from_r32
5577 */
5578static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5579{
5580 uint16_t fFsw = 0;
5581 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5582 {
5583 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5584 pr80Dst->sj64.fInteger = 1;
5585 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5586 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5587 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5588 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5589 }
5590 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5591 {
5592 pr80Dst->s.fSign = pr32Val->s.fSign;
5593 pr80Dst->s.uExponent = 0;
5594 pr80Dst->s.uMantissa = 0;
5595 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5596 }
5597 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5598 {
5599 /* Subnormal -> normalized + X86_FSW_DE return. */
5600 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5601 pr80Dst->sj64.fInteger = 1;
5602 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5603 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5604 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5605 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5606 fFsw = X86_FSW_DE;
5607 }
5608 else if (RTFLOAT32U_IS_INF(pr32Val))
5609 {
5610 pr80Dst->s.fSign = pr32Val->s.fSign;
5611 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5612 pr80Dst->s.uMantissa = RT_BIT_64(63);
5613 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5614 }
5615 else
5616 {
5617 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5618 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5619 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5620 pr80Dst->sj64.fInteger = 1;
5621 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5622 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5623 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5624 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5625 }
5626 return fFsw;
5627}
5628
5629
5630/**
5631 * See also iemAImpl_fld_r80_from_r64
5632 */
5633static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5634{
5635 uint16_t fFsw = 0;
5636 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5637 {
5638 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5639 pr80Dst->sj64.fInteger = 1;
5640 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5641 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5642 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5643 }
5644 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5645 {
5646 pr80Dst->s.fSign = pr64Val->s.fSign;
5647 pr80Dst->s.uExponent = 0;
5648 pr80Dst->s.uMantissa = 0;
5649 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5650 }
5651 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5652 {
5653 /* Subnormal values gets normalized. */
5654 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5655 pr80Dst->sj64.fInteger = 1;
5656 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5657 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5658 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5659 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5660 fFsw = X86_FSW_DE;
5661 }
5662 else if (RTFLOAT64U_IS_INF(pr64Val))
5663 {
5664 pr80Dst->s.fSign = pr64Val->s.fSign;
5665 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5666 pr80Dst->s.uMantissa = RT_BIT_64(63);
5667 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5668 }
5669 else
5670 {
5671 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5672 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5673 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5674 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5675 pr80Dst->sj64.fInteger = 1;
5676 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5677 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5678 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5679 }
5680 return fFsw;
5681}
5682
5683
5684/**
5685 * See also EMIT_FILD.
5686 */
5687#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5688static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5689{ \
5690 if (iVal == 0) \
5691 { \
5692 pr80Dst->s.fSign = 0; \
5693 pr80Dst->s.uExponent = 0; \
5694 pr80Dst->s.uMantissa = 0; \
5695 } \
5696 else \
5697 { \
5698 if (iVal > 0) \
5699 pr80Dst->s.fSign = 0; \
5700 else \
5701 { \
5702 pr80Dst->s.fSign = 1; \
5703 iVal = -iVal; \
5704 } \
5705 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5706 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5707 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5708 } \
5709 return pr80Dst; \
5710}
5711EMIT_CONVERT_IXX_TO_R80(16)
5712EMIT_CONVERT_IXX_TO_R80(32)
5713//EMIT_CONVERT_IXX_TO_R80(64)
5714
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * Emits a function @a a_Name that widens the 64-bit second operand to 80 bits
 * and hands both operands to @a a_fnR80ByR80.  A denormal second operand is
 * reported via X86_FSW_DE unless the first operand is a NaN or 387-invalid, or
 * @a a_DenormalException evaluates to true.  If the denormal exception is
 * unmasked the operation is not carried out and the first operand is returned
 * as the result.  The TOP field of the returned FSW is always set to 7.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        /* The second operand was a denormal. */ \
        if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
            fFsw = 0; /* suppressed */ \
        else if ((pFpuState->FCW & X86_FCW_DM) == 0) \
        { \
            /* Unmasked #D: do not perform the operation. */ \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5737
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * Emits a function @a a_Name that widens the 32-bit second operand to 80 bits
 * and hands both operands to @a a_fnR80ByR80.  A denormal second operand is
 * reported via X86_FSW_DE unless the first operand is a NaN or 387-invalid, or
 * @a a_DenormalException evaluates to true.  If the denormal exception is
 * unmasked the operation is not carried out and the first operand is returned
 * as the result.  The TOP field of the returned FSW is always set to 7.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        /* The second operand was a denormal. */ \
        if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
            fFsw = 0; /* suppressed */ \
        else if ((pFpuState->FCW & X86_FCW_DM) == 0) \
        { \
            /* Unmasked #D: do not perform the operation. */ \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5760
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * Emits a function @a a_Name that converts the 32-bit integer operand to an
 * 80-bit float, calls @a a_fnR80ByR80 on the pair and then sets the TOP field
 * of the returned FSW to 7.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U        r80Val2; \
    PRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5769
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * Emits a function @a a_Name that converts the 16-bit integer operand to an
 * 80-bit float, calls @a a_fnR80ByR80 on the pair and then sets the TOP field
 * of the returned FSW to 7.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U        r80Val2; \
    PRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5778
5779
5780
5781/*********************************************************************************************************************************
*   x87 FPU Division Operations                                                                                                  *
5783*********************************************************************************************************************************/
5784
5785/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5786static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5787 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5788{
5789 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5790 {
5791 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5792 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5793 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5794 }
5795 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5796 { /* Div by zero. */
5797 if (fFcw & X86_FCW_ZM)
5798 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5799 else
5800 {
5801 *pr80Result = *pr80Val1Org;
5802 fFsw |= X86_FSW_ES | X86_FSW_B;
5803 }
5804 fFsw |= X86_FSW_ZE;
5805 }
5806 else
5807 { /* Invalid operand */
5808 if (fFcw & X86_FCW_IM)
5809 *pr80Result = g_r80Indefinite;
5810 else
5811 {
5812 *pr80Result = *pr80Val1Org;
5813 fFsw |= X86_FSW_ES | X86_FSW_B;
5814 }
5815 fFsw |= X86_FSW_IE;
5816 }
5817 return fFsw;
5818}
5819
5820
5821IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5822 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5823{
5824 uint16_t const fFcw = pFpuState->FCW;
5825 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5826
5827 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5828 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5829 {
5830 if (fFcw & X86_FCW_IM)
5831 pFpuRes->r80Result = g_r80Indefinite;
5832 else
5833 {
5834 pFpuRes->r80Result = *pr80Val1;
5835 fFsw |= X86_FSW_ES | X86_FSW_B;
5836 }
5837 fFsw |= X86_FSW_IE;
5838 }
5839 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5840 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5841 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5842 {
5843 if (fFcw & X86_FCW_DM)
5844 {
5845 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5846 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5847 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5848 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5849 }
5850 else
5851 {
5852 pFpuRes->r80Result = *pr80Val1;
5853 fFsw |= X86_FSW_ES | X86_FSW_B;
5854 }
5855 fFsw |= X86_FSW_DE;
5856 }
5857 /* SoftFloat can handle the rest: */
5858 else
5859 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5860
5861 pFpuRes->FSW = fFsw;
5862}
5863
5864
/* FDIV variants taking a 64-bit float, 32-bit float, 32-bit integer or 16-bit
   integer as second operand; each converts that operand to 80 bits and defers
   to iemAImpl_fdiv_r80_by_r80.  No extra denormal-exception suppression (0). */
EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64,  iemAImpl_fdiv_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32,  iemAImpl_fdiv_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5869
5870
5871IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5872 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5873{
5874 uint16_t const fFcw = pFpuState->FCW;
5875 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5876
5877 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5878 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5879 {
5880 if (fFcw & X86_FCW_IM)
5881 pFpuRes->r80Result = g_r80Indefinite;
5882 else
5883 {
5884 pFpuRes->r80Result = *pr80Val1;
5885 fFsw |= X86_FSW_ES | X86_FSW_B;
5886 }
5887 fFsw |= X86_FSW_IE;
5888 }
5889 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5890 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5891 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5892 {
5893 if (fFcw & X86_FCW_DM)
5894 {
5895 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5896 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5897 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5898 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5899 }
5900 else
5901 {
5902 pFpuRes->r80Result = *pr80Val1;
5903 fFsw |= X86_FSW_ES | X86_FSW_B;
5904 }
5905 fFsw |= X86_FSW_DE;
5906 }
5907 /* SoftFloat can handle the rest: */
5908 else
5909 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5910
5911 pFpuRes->FSW = fFsw;
5912}
5913
5914
/* FDIVR variants taking a 64-bit float, 32-bit float, 32-bit integer or 16-bit
   integer as second operand; each converts that operand to 80 bits and defers
   to iemAImpl_fdivr_r80_by_r80.  For the float variants the denormal exception
   from the conversion is suppressed when the first operand is zero. */
EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64,  iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32,  iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5919
5920
5921/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5922static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5923 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5924{
5925 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5926 {
5927 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5928 uint16_t fCxFlags = 0;
5929 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5930 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5931 &fCxFlags, &SoftState);
5932 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5933 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5934 if ( !(fFsw & X86_FSW_IE)
5935 && !RTFLOAT80U_IS_NAN(pr80Result)
5936 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5937 {
5938 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5939 fFsw |= fCxFlags & X86_FSW_C_MASK;
5940 }
5941 return fFsw;
5942 }
5943
5944 /* Invalid operand */
5945 if (fFcw & X86_FCW_IM)
5946 *pr80Result = g_r80Indefinite;
5947 else
5948 {
5949 *pr80Result = *pr80Val1Org;
5950 fFsw |= X86_FSW_ES | X86_FSW_B;
5951 }
5952 return fFsw | X86_FSW_IE;
5953}
5954
5955
5956static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5957 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5958{
5959 uint16_t const fFcw = pFpuState->FCW;
5960 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5961
5962 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5963 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5964 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5965 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5966 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5967 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5968 {
5969 if (fFcw & X86_FCW_IM)
5970 pFpuRes->r80Result = g_r80Indefinite;
5971 else
5972 {
5973 pFpuRes->r80Result = *pr80Val1;
5974 fFsw |= X86_FSW_ES | X86_FSW_B;
5975 }
5976 fFsw |= X86_FSW_IE;
5977 }
5978 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5979 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5980 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5981 {
5982 if (fFcw & X86_FCW_DM)
5983 {
5984 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5985 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5986 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5987 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5988 pr80Val1Org, fLegacyInstr);
5989 }
5990 else
5991 {
5992 pFpuRes->r80Result = *pr80Val1;
5993 fFsw |= X86_FSW_ES | X86_FSW_B;
5994 }
5995 fFsw |= X86_FSW_DE;
5996 }
5997 /* SoftFloat can handle the rest: */
5998 else
5999 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
6000 pr80Val1, fLegacyInstr);
6001
6002 pFpuRes->FSW = fFsw;
6003}
6004
6005
6006IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6007 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6008{
6009 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
6010}
6011
6012
6013IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6014 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6015{
6016 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
6017}
6018
6019
6020/*********************************************************************************************************************************
6021* x87 FPU Multiplication Operations *
6022*********************************************************************************************************************************/
6023
6024/** Worker for iemAImpl_fmul_r80_by_r80. */
6025static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6026 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6027{
6028 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6029 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6030 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6031}
6032
6033
6034IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6035 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6036{
6037 uint16_t const fFcw = pFpuState->FCW;
6038 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6039
6040 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6041 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6042 {
6043 if (fFcw & X86_FCW_IM)
6044 pFpuRes->r80Result = g_r80Indefinite;
6045 else
6046 {
6047 pFpuRes->r80Result = *pr80Val1;
6048 fFsw |= X86_FSW_ES | X86_FSW_B;
6049 }
6050 fFsw |= X86_FSW_IE;
6051 }
6052 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6053 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6054 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6055 {
6056 if (fFcw & X86_FCW_DM)
6057 {
6058 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6059 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6060 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6061 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6062 }
6063 else
6064 {
6065 pFpuRes->r80Result = *pr80Val1;
6066 fFsw |= X86_FSW_ES | X86_FSW_B;
6067 }
6068 fFsw |= X86_FSW_DE;
6069 }
6070 /* SoftFloat can handle the rest: */
6071 else
6072 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6073
6074 pFpuRes->FSW = fFsw;
6075}
6076
6077
/* FMUL variants taking a 64-bit float, 32-bit float, 32-bit integer or 16-bit
   integer as second operand; each converts that operand to 80 bits and defers
   to iemAImpl_fmul_r80_by_r80.  No extra denormal-exception suppression (0). */
EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64,  iemAImpl_fmul_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32,  iemAImpl_fmul_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
6082
6083
6084/*********************************************************************************************************************************
6085* x87 FPU Addition *
6086*********************************************************************************************************************************/
6087
6088/** Worker for iemAImpl_fadd_r80_by_r80. */
6089static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6090 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6091{
6092 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6093 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6094 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6095}
6096
6097
6098IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6099 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6100{
6101 uint16_t const fFcw = pFpuState->FCW;
6102 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6103
6104 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6105 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6106 {
6107 if (fFcw & X86_FCW_IM)
6108 pFpuRes->r80Result = g_r80Indefinite;
6109 else
6110 {
6111 pFpuRes->r80Result = *pr80Val1;
6112 fFsw |= X86_FSW_ES | X86_FSW_B;
6113 }
6114 fFsw |= X86_FSW_IE;
6115 }
6116 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6117 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6118 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6119 {
6120 if (fFcw & X86_FCW_DM)
6121 {
6122 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6123 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6124 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6125 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6126 }
6127 else
6128 {
6129 pFpuRes->r80Result = *pr80Val1;
6130 fFsw |= X86_FSW_ES | X86_FSW_B;
6131 }
6132 fFsw |= X86_FSW_DE;
6133 }
6134 /* SoftFloat can handle the rest: */
6135 else
6136 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6137
6138 pFpuRes->FSW = fFsw;
6139}
6140
6141
/* FADD variants taking a 64-bit float, 32-bit float, 32-bit integer or 16-bit
   integer as second operand; each converts that operand to 80 bits and defers
   to iemAImpl_fadd_r80_by_r80.  No extra denormal-exception suppression (0). */
EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64,  iemAImpl_fadd_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32,  iemAImpl_fadd_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6146
6147
6148/*********************************************************************************************************************************
6149* x87 FPU Subtraction *
6150*********************************************************************************************************************************/
6151
6152/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6153static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6154 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6155{
6156 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6157 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6158 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6159}
6160
6161
6162IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6163 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6164{
6165 uint16_t const fFcw = pFpuState->FCW;
6166 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6167
6168 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6169 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6170 {
6171 if (fFcw & X86_FCW_IM)
6172 pFpuRes->r80Result = g_r80Indefinite;
6173 else
6174 {
6175 pFpuRes->r80Result = *pr80Val1;
6176 fFsw |= X86_FSW_ES | X86_FSW_B;
6177 }
6178 fFsw |= X86_FSW_IE;
6179 }
6180 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6181 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6182 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6183 {
6184 if (fFcw & X86_FCW_DM)
6185 {
6186 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6187 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6188 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6189 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6190 }
6191 else
6192 {
6193 pFpuRes->r80Result = *pr80Val1;
6194 fFsw |= X86_FSW_ES | X86_FSW_B;
6195 }
6196 fFsw |= X86_FSW_DE;
6197 }
6198 /* SoftFloat can handle the rest: */
6199 else
6200 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6201
6202 pFpuRes->FSW = fFsw;
6203}
6204
6205
/*
 * FSUB/FISUB variants taking a 64-bit real, 32-bit real, 32-bit integer and
 * 16-bit integer second operand (going by the emitted names).  All four are
 * generated by the EMIT_R80_BY_* macros from iemAImpl_fsub_r80_by_r80; the
 * macro definitions are not part of this section, so presumably they convert
 * the operand to 80-bit and forward to that function - confirm there.  The
 * meaning of the trailing 0 argument is likewise defined by the macros.
 */
6206EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6207EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6208EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6209EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6210
6211
6212/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6213IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6214 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6215{
6216 uint16_t const fFcw = pFpuState->FCW;
6217 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6218
6219 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6220 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6221 {
6222 if (fFcw & X86_FCW_IM)
6223 pFpuRes->r80Result = g_r80Indefinite;
6224 else
6225 {
6226 pFpuRes->r80Result = *pr80Val1;
6227 fFsw |= X86_FSW_ES | X86_FSW_B;
6228 }
6229 fFsw |= X86_FSW_IE;
6230 }
6231 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6232 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6233 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6234 {
6235 if (fFcw & X86_FCW_DM)
6236 {
6237 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6238 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6239 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6240 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6241 }
6242 else
6243 {
6244 pFpuRes->r80Result = *pr80Val1;
6245 fFsw |= X86_FSW_ES | X86_FSW_B;
6246 }
6247 fFsw |= X86_FSW_DE;
6248 }
6249 /* SoftFloat can handle the rest: */
6250 else
6251 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6252
6253 pFpuRes->FSW = fFsw;
6254}
6255
6256
/*
 * FSUBR/FISUBR variants taking a 64-bit real, 32-bit real, 32-bit integer and
 * 16-bit integer second operand (going by the emitted names).  All four are
 * generated by the EMIT_R80_BY_* macros from iemAImpl_fsubr_r80_by_r80; the
 * macro definitions are not part of this section, so presumably they convert
 * the operand to 80-bit and forward to that function - confirm there.  The
 * meaning of the trailing 0 argument is likewise defined by the macros.
 */
6257EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6258EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6259EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6260EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6261
6262
6263/*********************************************************************************************************************************
6264* x87 FPU Trigonometric Operations *
6265*********************************************************************************************************************************/
6266static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6267{
6268 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6269 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6270 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6271 extFloat80_t v;
6272 (void)fFcw;
6273
6274 v = extF80_atan2(y, x, &SoftState);
6275
6276 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6277 return fFsw;
6278}
6279
6280IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6281 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6282{
6283 uint16_t const fFcw = pFpuState->FCW;
6284 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6285
6286 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6287 {
6288 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6289
6290 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6291 if (!(fFcw & X86_FCW_PM))
6292 fFsw |= X86_FSW_ES | X86_FSW_B;
6293 }
6294 else
6295 {
6296 fFsw |= X86_FSW_IE;
6297 if (!(fFcw & X86_FCW_IM))
6298 {
6299 pFpuRes->r80Result = *pr80Val2;
6300 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6301 }
6302 else
6303 {
6304 pFpuRes->r80Result = g_r80Indefinite;
6305 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6306 }
6307 }
6308
6309 pFpuRes->FSW = fFsw;
6310}
6311#endif /* IEM_WITHOUT_ASSEMBLY */
6312
6313IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6314 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6315{
6316 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6317}
6318
6319IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6320 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6321{
6322 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6323}
6324
6325
6326#if defined(IEM_WITHOUT_ASSEMBLY)
6327static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6328{
6329 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6330 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6331 extFloat80_t v;
6332 (void)fFcw;
6333
6334 v = extF80_tan(x, &SoftState);
6335
6336 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6337 return fFsw;
6338}
6339
6340IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6341{
6342 uint16_t const fFcw = pFpuState->FCW;
6343 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6344
6345 if (RTFLOAT80U_IS_ZERO(pr80Val))
6346 {
6347 pFpuResTwo->r80Result1 = *pr80Val;
6348 pFpuResTwo->r80Result2 = g_ar80One[0];
6349 }
6350 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6351 {
6352 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6353 {
6354 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6355 pFpuResTwo->r80Result1 = *pr80Val;
6356 }
6357 else
6358 {
6359 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6360 {
6361 pFpuResTwo->r80Result1 = *pr80Val;
6362 }
6363 else
6364 {
6365 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6366 }
6367
6368 pFpuResTwo->r80Result2 = g_ar80One[0];
6369
6370 fFsw |= X86_FSW_PE;
6371 if (!(fFcw & X86_FCW_PM))
6372 fFsw |= X86_FSW_ES | X86_FSW_B;
6373 }
6374 }
6375 else
6376 {
6377 fFsw |= X86_FSW_IE;
6378 if (!(fFcw & X86_FCW_IM))
6379 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6380 }
6381
6382 pFpuResTwo->FSW = fFsw;
6383}
6384#endif /* IEM_WITHOUT_ASSEMBLY */
6385
6386IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6387{
6388 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6389}
6390
6391IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6392{
6393 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6394}
6395
6396#ifdef IEM_WITHOUT_ASSEMBLY
6397
6398static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6399{
6400 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6401 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6402 extFloat80_t v;
6403 (void)fFcw;
6404
6405 v = extF80_sin(x, &SoftState);
6406
6407 iemFpuSoftF80ToIprt(pr80Result, v);
6408
6409 return fFsw;
6410}
6411
6412IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6413{
6414 uint16_t const fFcw = pFpuState->FCW;
6415 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6416
6417 if (RTFLOAT80U_IS_ZERO(pr80Val))
6418 {
6419 pFpuRes->r80Result = *pr80Val;
6420 }
6421 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6422 {
6423 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6424 {
6425 fFsw |= X86_FSW_C2;
6426 pFpuRes->r80Result = *pr80Val;
6427 }
6428 else
6429 {
6430 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6431 {
6432 pFpuRes->r80Result = *pr80Val;
6433 }
6434 else
6435 {
6436 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6437 }
6438 fFsw |= X86_FSW_PE;
6439 if (!(fFcw & X86_FCW_PM))
6440 fFsw |= X86_FSW_ES | X86_FSW_B;
6441 }
6442 }
6443 else if (RTFLOAT80U_IS_INF(pr80Val))
6444 {
6445 fFsw |= X86_FSW_IE;
6446 if (!(fFcw & X86_FCW_IM))
6447 {
6448 fFsw |= X86_FSW_ES | X86_FSW_B;
6449 pFpuRes->r80Result = *pr80Val;
6450 }
6451 else
6452 {
6453 pFpuRes->r80Result = g_r80Indefinite;
6454 }
6455 }
6456 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6457 {
6458 fFsw |= X86_FSW_DE;
6459
6460 if (fFcw & X86_FCW_DM)
6461 {
6462 if (fFcw & X86_FCW_UM)
6463 {
6464 pFpuRes->r80Result = *pr80Val;
6465 }
6466 else
6467 {
6468 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6469 uint64_t uMantissa = pr80Val->s.uMantissa;
6470 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6471
6472 uExponent = 64 - uExponent;
6473 uMantissa <<= uExponent;
6474 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6475
6476 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6477 pFpuRes->r80Result.s.uMantissa = uMantissa;
6478 pFpuRes->r80Result.s.uExponent = uExponent;
6479 }
6480
6481 fFsw |= X86_FSW_UE | X86_FSW_PE;
6482
6483 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6484 {
6485 /* All the exceptions are masked. */
6486 }
6487 else
6488 {
6489 fFsw |= X86_FSW_ES | X86_FSW_B;
6490 }
6491 }
6492 else
6493 {
6494 pFpuRes->r80Result = *pr80Val;
6495
6496 fFsw |= X86_FSW_ES | X86_FSW_B;
6497 }
6498 }
6499 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6500 {
6501 pFpuRes->r80Result = *pr80Val;
6502 fFsw |= X86_FSW_DE;
6503
6504 if (fFcw & X86_FCW_DM)
6505 {
6506 if (fFcw & X86_FCW_PM)
6507 {
6508 fFsw |= X86_FSW_PE;
6509 }
6510 else
6511 {
6512 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6513 }
6514
6515 pFpuRes->r80Result.sj64.uExponent = 1;
6516 }
6517 else
6518 {
6519 fFsw |= X86_FSW_ES | X86_FSW_B;
6520 }
6521 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6522 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6523 {
6524 pFpuRes->r80Result = *pr80Val;
6525 } else {
6526 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6527 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6528 && (fFcw & X86_FCW_IM))
6529 pFpuRes->r80Result = g_r80Indefinite;
6530 else
6531 {
6532 pFpuRes->r80Result = *pr80Val;
6533 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6534 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6535 }
6536
6537 fFsw |= X86_FSW_IE;
6538 if (!(fFcw & X86_FCW_IM))
6539 fFsw |= X86_FSW_ES | X86_FSW_B;
6540 }
6541
6542 pFpuRes->FSW = fFsw;
6543}
6544#endif /* IEM_WITHOUT_ASSEMBLY */
6545
6546IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6547{
6548 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6549}
6550
6551IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6552{
6553 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6554}
6555
6556#ifdef IEM_WITHOUT_ASSEMBLY
6557
6558static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6559{
6560 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6561 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6562 extFloat80_t v;
6563 (void)fFcw;
6564
6565 v = extF80_cos(x, &SoftState);
6566
6567 iemFpuSoftF80ToIprt(pr80Result, v);
6568
6569 return fFsw;
6570}
6571
6572IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6573{
6574 uint16_t const fFcw = pFpuState->FCW;
6575 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6576
6577 if (RTFLOAT80U_IS_ZERO(pr80Val))
6578 {
6579 pFpuRes->r80Result = g_ar80One[0];
6580 }
6581 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6582 {
6583 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6584 {
6585 fFsw |= X86_FSW_C2;
6586 pFpuRes->r80Result = *pr80Val;
6587 }
6588 else
6589 {
6590 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6591 {
6592 pFpuRes->r80Result = g_ar80One[0];
6593
6594 }
6595 else
6596 {
6597 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6598 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6599 }
6600 fFsw |= X86_FSW_PE;
6601 if (!(fFcw & X86_FCW_PM))
6602 fFsw |= X86_FSW_ES | X86_FSW_B;
6603 }
6604 }
6605 else if (RTFLOAT80U_IS_INF(pr80Val))
6606 {
6607 fFsw |= X86_FSW_IE;
6608 if (!(fFcw & X86_FCW_IM))
6609 {
6610 fFsw |= X86_FSW_ES | X86_FSW_B;
6611 pFpuRes->r80Result = *pr80Val;
6612 }
6613 else
6614 {
6615 pFpuRes->r80Result = g_r80Indefinite;
6616 }
6617 }
6618 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6619 {
6620 fFsw |= X86_FSW_DE;
6621
6622 if (fFcw & X86_FCW_DM)
6623 {
6624 pFpuRes->r80Result = g_ar80One[0];
6625
6626 if (fFcw & X86_FCW_PM)
6627 {
6628 fFsw |= X86_FSW_PE;
6629 }
6630 else
6631 {
6632 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6633 }
6634 }
6635 else
6636 {
6637 pFpuRes->r80Result = *pr80Val;
6638 fFsw |= X86_FSW_ES | X86_FSW_B;
6639 }
6640 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6641 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6642 {
6643 pFpuRes->r80Result = *pr80Val;
6644 } else {
6645 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6646 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6647 && (fFcw & X86_FCW_IM))
6648 pFpuRes->r80Result = g_r80Indefinite;
6649 else
6650 {
6651 pFpuRes->r80Result = *pr80Val;
6652 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6653 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6654 }
6655
6656 fFsw |= X86_FSW_IE;
6657 if (!(fFcw & X86_FCW_IM))
6658 fFsw |= X86_FSW_ES | X86_FSW_B;
6659 }
6660
6661 pFpuRes->FSW = fFsw;
6662}
6663#endif /* IEM_WITHOUT_ASSEMBLY */
6664
6665IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6666{
6667 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6668}
6669
6670IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6671{
6672 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6673}
6674
6675#ifdef IEM_WITHOUT_ASSEMBLY
6676
6677static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6678{
6679 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6680 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6681 extFloat80_t r80Sin, r80Cos;
6682 (void)fFcw;
6683
6684 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6685
6686 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6687 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6688
6689 return fFsw;
6690}
6691
6692IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6693{
6694 uint16_t const fFcw = pFpuState->FCW;
6695 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6696
6697 if (RTFLOAT80U_IS_ZERO(pr80Val))
6698 {
6699 pFpuResTwo->r80Result1 = *pr80Val;
6700 pFpuResTwo->r80Result2 = g_ar80One[0];
6701 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6702 }
6703 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6704 {
6705 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6706 {
6707 fFsw |= X86_FSW_C2;
6708
6709 if (fFcw & X86_FCW_IM)
6710 {
6711 pFpuResTwo->r80Result1 = g_r80Indefinite;
6712 }
6713 else
6714 {
6715 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6716 }
6717
6718 pFpuResTwo->r80Result2 = *pr80Val;
6719 }
6720 else
6721 {
6722 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6723
6724 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6725 {
6726 pFpuResTwo->r80Result1 = *pr80Val;
6727 pFpuResTwo->r80Result2 = g_ar80One[0];
6728 }
6729 else
6730 {
6731 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6732 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6733 }
6734 fFsw |= X86_FSW_PE;
6735 if (!(fFcw & X86_FCW_PM))
6736 fFsw |= X86_FSW_ES | X86_FSW_B;
6737 }
6738 }
6739 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6740 {
6741 fFsw |= X86_FSW_DE;
6742
6743 if (fFcw & X86_FCW_DM)
6744 {
6745 pFpuResTwo->r80Result1 = *pr80Val;
6746 pFpuResTwo->r80Result2 = g_ar80One[0];
6747 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6748
6749 if (fFcw & X86_FCW_PM)
6750 {
6751 fFsw |= X86_FSW_PE;
6752 }
6753 else
6754 {
6755 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6756 }
6757
6758 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6759 }
6760 else
6761 {
6762 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6763 pFpuResTwo->r80Result2 = *pr80Val;
6764 fFsw |= X86_FSW_ES | X86_FSW_B;
6765 }
6766 }
6767 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6768 {
6769 fFsw |= X86_FSW_DE;
6770
6771 if (fFcw & X86_FCW_DM)
6772 {
6773 pFpuResTwo->r80Result2 = g_ar80One[0];
6774
6775 if (fFcw & X86_FCW_UM)
6776 {
6777 pFpuResTwo->r80Result1 = *pr80Val;
6778 }
6779 else
6780 {
6781 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6782 uint64_t uMantissa = pr80Val->s.uMantissa;
6783 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6784
6785 uExponent = 64 - uExponent;
6786 uMantissa <<= uExponent;
6787 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6788
6789 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6790 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6791 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6792 }
6793
6794 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6795 fFsw |= X86_FSW_UE | X86_FSW_PE;
6796
6797 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6798 {
6799 /* All the exceptions are masked. */
6800 }
6801 else
6802 {
6803 fFsw |= X86_FSW_ES | X86_FSW_B;
6804 }
6805 }
6806 else
6807 {
6808 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6809 pFpuResTwo->r80Result2 = *pr80Val;
6810 fFsw |= X86_FSW_ES | X86_FSW_B;
6811 }
6812 }
6813 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6814 {
6815 pFpuResTwo->r80Result1 = *pr80Val;
6816 pFpuResTwo->r80Result2 = *pr80Val;
6817 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6818 }
6819 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6820 {
6821 if (fFcw & X86_FCW_IM)
6822 {
6823 pFpuResTwo->r80Result1 = g_r80Indefinite;
6824 pFpuResTwo->r80Result2 = g_r80Indefinite;
6825 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6826 }
6827 else
6828 {
6829 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6830 pFpuResTwo->r80Result2 = *pr80Val;
6831 }
6832
6833 fFsw |= X86_FSW_IE;
6834 if (!(fFcw & X86_FCW_IM))
6835 fFsw |= X86_FSW_ES | X86_FSW_B;
6836 }
6837 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6838 {
6839 pFpuResTwo->r80Result1 = *pr80Val;
6840 pFpuResTwo->r80Result2 = *pr80Val;
6841
6842 if (fFcw & X86_FCW_IM)
6843 {
6844 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6845 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6846 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6847 }
6848 else
6849 {
6850 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6851 pFpuResTwo->r80Result2 = *pr80Val;
6852 }
6853
6854 fFsw |= X86_FSW_IE;
6855 if (!(fFcw & X86_FCW_IM))
6856 fFsw |= X86_FSW_ES | X86_FSW_B;
6857 }
6858 else if (RTFLOAT80U_IS_INF(pr80Val))
6859 {
6860 if (fFcw & X86_FCW_IM)
6861 {
6862 pFpuResTwo->r80Result1 = g_r80Indefinite;
6863 pFpuResTwo->r80Result2 = g_r80Indefinite;
6864 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6865 }
6866 else
6867 {
6868 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6869 pFpuResTwo->r80Result2 = *pr80Val;
6870 }
6871
6872 fFsw |= X86_FSW_IE;
6873 if (!(fFcw & X86_FCW_IM))
6874 fFsw |= X86_FSW_ES | X86_FSW_B;
6875 }
6876
6877 pFpuResTwo->FSW = fFsw;
6878}
6879#endif /* IEM_WITHOUT_ASSEMBLY */
6880
6881IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6882{
6883 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6884}
6885
6886IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6887{
6888 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6889}
6890
6891#ifdef IEM_WITHOUT_ASSEMBLY
6892
6893
6894/*********************************************************************************************************************************
6895* x87 FPU Compare and Testing Operations *
6896*********************************************************************************************************************************/
6897
6898IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6899{
6900 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6901
6902 if (RTFLOAT80U_IS_ZERO(pr80Val))
6903 fFsw |= X86_FSW_C3;
6904 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6905 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6906 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6907 {
6908 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6909 if (!(pFpuState->FCW & X86_FCW_DM))
6910 fFsw |= X86_FSW_ES | X86_FSW_B;
6911 }
6912 else
6913 {
6914 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6915 if (!(pFpuState->FCW & X86_FCW_IM))
6916 fFsw |= X86_FSW_ES | X86_FSW_B;
6917 }
6918
6919 *pu16Fsw = fFsw;
6920}
6921
6922
6923IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6924{
6925 RT_NOREF(pFpuState);
6926 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6927
6928 /* C1 = sign bit (always, even if empty Intel says). */
6929 if (pr80Val->s.fSign)
6930 fFsw |= X86_FSW_C1;
6931
6932 /* Classify the value in C0, C2, C3. */
6933 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6934 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6935 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6936 fFsw |= X86_FSW_C2;
6937 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6938 fFsw |= X86_FSW_C3;
6939 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6940 fFsw |= X86_FSW_C0;
6941 else if (RTFLOAT80U_IS_INF(pr80Val))
6942 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6943 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6944 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6945 /* whatever else: 0 */
6946
6947 *pu16Fsw = fFsw;
6948}
6949
6950
6951/**
6952 * Worker for fcom, fucom, and friends.
6953 */
6954static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6955 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6956{
6957 /*
6958 * Unpack the values.
6959 */
6960 bool const fSign1 = pr80Val1->s.fSign;
6961 int32_t iExponent1 = pr80Val1->s.uExponent;
6962 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6963
6964 bool const fSign2 = pr80Val2->s.fSign;
6965 int32_t iExponent2 = pr80Val2->s.uExponent;
6966 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6967
6968 /*
6969 * Check for invalid inputs.
6970 */
6971 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6972 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6973 {
6974 if (!(fFcw & X86_FCW_IM))
6975 fFsw |= X86_FSW_ES | X86_FSW_B;
6976 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6977 }
6978
6979 /*
6980 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6981 */
6982 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6983 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6984 {
6985 if ( fIeOnAllNaNs
6986 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6987 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6988 {
6989 fFsw |= X86_FSW_IE;
6990 if (!(fFcw & X86_FCW_IM))
6991 fFsw |= X86_FSW_ES | X86_FSW_B;
6992 }
6993 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6994 }
6995
6996 /*
6997 * Normalize the values.
6998 */
6999 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7000 {
7001 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7002 iExponent1 = 1;
7003 else
7004 {
7005 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
7006 uMantissa1 <<= iExponent1;
7007 iExponent1 = 1 - iExponent1;
7008 }
7009 fFsw |= X86_FSW_DE;
7010 if (!(fFcw & X86_FCW_DM))
7011 fFsw |= X86_FSW_ES | X86_FSW_B;
7012 }
7013
7014 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7015 {
7016 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7017 iExponent2 = 1;
7018 else
7019 {
7020 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
7021 uMantissa2 <<= iExponent2;
7022 iExponent2 = 1 - iExponent2;
7023 }
7024 fFsw |= X86_FSW_DE;
7025 if (!(fFcw & X86_FCW_DM))
7026 fFsw |= X86_FSW_ES | X86_FSW_B;
7027 }
7028
7029 /*
7030 * Test if equal (val1 == val2):
7031 */
7032 if ( uMantissa1 == uMantissa2
7033 && iExponent1 == iExponent2
7034 && ( fSign1 == fSign2
7035 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
7036 fFsw |= X86_FSW_C3;
7037 /*
7038 * Test if less than (val1 < val2):
7039 */
7040 else if (fSign1 && !fSign2)
7041 fFsw |= X86_FSW_C0;
7042 else if (fSign1 == fSign2)
7043 {
7044 /* Zeros are problematic, however at the most one can be zero here. */
7045 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
7046 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7047 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
7048 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7049
7050 if ( fSign1
7051 ^ ( iExponent1 < iExponent2
7052 || ( iExponent1 == iExponent2
7053 && uMantissa1 < uMantissa2 ) ) )
7054 fFsw |= X86_FSW_C0;
7055 }
7056 /* else: No flags set if greater. */
7057
7058 return fFsw;
7059}
7060
7061
7062IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7063 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7064{
7065 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7066}
7067
7068
7069
7070
7071IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7072 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7073{
7074 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
7075}
7076
7077
7078IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7079 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
7080{
7081 RTFLOAT80U r80Val2;
7082 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
7083 Assert(!fFsw || fFsw == X86_FSW_DE);
7084 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7085 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7086 {
7087 if (!(pFpuState->FCW & X86_FCW_DM))
7088 fFsw |= X86_FSW_ES | X86_FSW_B;
7089 *pfFsw |= fFsw;
7090 }
7091}
7092
7093
7094IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7095 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
7096{
7097 RTFLOAT80U r80Val2;
7098 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
7099 Assert(!fFsw || fFsw == X86_FSW_DE);
7100 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7101 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7102 {
7103 if (!(pFpuState->FCW & X86_FCW_DM))
7104 fFsw |= X86_FSW_ES | X86_FSW_B;
7105 *pfFsw |= fFsw;
7106 }
7107}
7108
7109
7110IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7111 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
7112{
7113 RTFLOAT80U r80Val2;
7114 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
7115 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7116}
7117
7118
7119IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7120 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
7121{
7122 RTFLOAT80U r80Val2;
7123 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
7124 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7125}
7126
7127
7128/**
7129 * Worker for fcomi & fucomi.
7130 */
7131static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7132 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
7133{
7134 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
7135 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
7136 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
7137 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
7138
7139 /* Note! C1 is not cleared as per docs! Everything is preserved. */
7140 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
7141 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
7142}
7143
7144
7145IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7146 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7147{
7148 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
7149}
7150
7151
7152IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7153 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7154{
7155 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
7156}
7157
7158
7159/*********************************************************************************************************************************
7160* x87 FPU Other Operations *
7161*********************************************************************************************************************************/
7162
7163/**
7164 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7165 */
7166static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7167{
7168 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7169 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7170 true /*exact / generate #PE */, &SoftState));
7171 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7172}
7173
7174
7175IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7176{
7177 uint16_t const fFcw = pFpuState->FCW;
7178 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7179
7180 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7181 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7182 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7183 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7184 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7185 || RTFLOAT80U_IS_INF(pr80Val))
7186 pFpuRes->r80Result = *pr80Val;
7187 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7188 {
7189 fFsw |= X86_FSW_DE;
7190 if (fFcw & X86_FCW_DM)
7191 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7192 else
7193 {
7194 pFpuRes->r80Result = *pr80Val;
7195 fFsw |= X86_FSW_ES | X86_FSW_B;
7196 }
7197 }
7198 else
7199 {
7200 if (fFcw & X86_FCW_IM)
7201 {
7202 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7203 pFpuRes->r80Result = g_r80Indefinite;
7204 else
7205 {
7206 pFpuRes->r80Result = *pr80Val;
7207 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7208 }
7209 }
7210 else
7211 {
7212 pFpuRes->r80Result = *pr80Val;
7213 fFsw |= X86_FSW_ES | X86_FSW_B;
7214 }
7215 fFsw |= X86_FSW_IE;
7216 }
7217 pFpuRes->FSW = fFsw;
7218}
7219
7220
7221IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7222 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7223{
7224 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7225 it does everything we need it to do. */
7226 uint16_t const fFcw = pFpuState->FCW;
7227 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7228 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7229 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7230 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7231}
7232
7233
7234/**
7235 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7236 */
7237static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7238{
7239 Assert(!pr80Val->s.fSign);
7240 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7241 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7242 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7243}
7244
7245
7246IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7247{
7248 uint16_t const fFcw = pFpuState->FCW;
7249 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7250
7251 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7252 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7253 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7254 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7255 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7256 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7257 pFpuRes->r80Result = *pr80Val;
7258 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7259 {
7260 fFsw |= X86_FSW_DE;
7261 if (fFcw & X86_FCW_DM)
7262 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7263 else
7264 {
7265 pFpuRes->r80Result = *pr80Val;
7266 fFsw |= X86_FSW_ES | X86_FSW_B;
7267 }
7268 }
7269 else
7270 {
7271 if (fFcw & X86_FCW_IM)
7272 {
7273 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7274 pFpuRes->r80Result = g_r80Indefinite;
7275 else
7276 {
7277 pFpuRes->r80Result = *pr80Val;
7278 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7279 }
7280 }
7281 else
7282 {
7283 pFpuRes->r80Result = *pr80Val;
7284 fFsw |= X86_FSW_ES | X86_FSW_B;
7285 }
7286 fFsw |= X86_FSW_IE;
7287 }
7288 pFpuRes->FSW = fFsw;
7289}
7290
7291
7292/**
7293 * @code{.unparsed}
7294 * x x * ln2
7295 * f(x) = 2 - 1 = e - 1
7296 *
7297 * @endcode
7298 *
7299 * We can approximate e^x by a Taylor/Maclaurin series (see
7300 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7301 * @code{.unparsed}
7302 * n 0 1 2 3 4
7303 * inf x x x x x x
7304 * SUM ----- = --- + --- + --- + --- + --- + ...
7305 * n=0 n! 0! 1! 2! 3! 4!
7306 *
7307 * 2 3 4
7308 * x x x
7309 * = 1 + x + --- + --- + --- + ...
7310 * 2! 3! 4!
7311 * @endcode
7312 *
7313 * Given z = x * ln2, we get:
7314 * @code{.unparsed}
7315 * 2 3 4 n
7316 * z z z z z
7317 * e - 1 = z + --- + --- + --- + ... + ---
7318 * 2! 3! 4! n!
7319 * @endcode
7320 *
7321 * Wanting to use Horner's method, we move one z outside and get:
7322 * @code{.unparsed}
7323 * 2 3 (n-1)
7324 * z z z z
7325 * = z ( 1 + --- + --- + --- + ... + ------- )
7326 * 2! 3! 4! n!
7327 * @endcode
7328 *
7329 * The constants we need for using Horner's methods are 1 and 1 / n!.
7330 *
7331 * For very tiny x values, we can get away with f(x) = x * ln 2, because
7332 * because we don't have the necessary precision to represent 1.0 + z/3 + ...
7333 * and can approximate it to be 1.0. For a visual demonstration of this
7334 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
7335 * as it valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7336 *
7337 *
7338 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7339 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7340 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7341 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7342 * blocks). (The one bit difference is probably an implicit one missing from
7343 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7344 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7345 * exponent.
7346 *
7347 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
7348 * successfully reproduced the exact results from an Intel 10980XE, there is
7349 * always a portition of rounding differences. Not going to spend too much time
7350 * on getting this 100% the same, at least not now.
7351 *
7352 * P.S. If someone are really curious about 8087 and its contstants:
7353 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7354 *
7355 *
7356 * @param pr80Val The exponent value (x), less than 1.0, greater than
7357 * -1.0 and not zero. This can be a normal, denormal
7358 * or pseudo-denormal value.
7359 * @param pr80Result Where to return the result.
7360 * @param fFcw FPU control word.
7361 * @param fFsw FPU status word.
7362 */
7363static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7364{
7365 /* As mentioned above, we can skip the expensive polynomial calculation
7366 as it will be close enough to 1.0 that it makes no difference.
7367
7368 The cutoff point for intel 10980XE is exponents >= -69. Intel
7369 also seems to be using a 67-bit or 68-bit constant value, and we get
7370 a smattering of rounding differences if we go for higher precision. */
7371 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7372 {
7373 RTUINT256U u256;
7374 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7375 u256.QWords.qw0 |= 1; /* force #PE */
7376 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7377 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7378 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7379 : 1 - RTFLOAT80U_EXP_BIAS,
7380 fFcw, fFsw);
7381 }
7382 else
7383 {
7384#ifdef IEM_WITH_FLOAT128_FOR_FPU
7385 /* This approach is not good enough for small values, we end up with zero. */
7386 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7387 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7388 _Float128 rd128Result = powf128(2.0L, rd128Val);
7389 rd128Result -= 1.0L;
7390 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7391 iemFpuF128RestoreRounding(fOldRounding);
7392
7393# else
7394 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7395 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7396
7397 /* As mentioned above, enforce 68-bit internal mantissa width to better
7398 match the Intel 10980XE results. */
7399 unsigned const cPrecision = 68;
7400
7401 /* first calculate z = x * ln2 */
7402 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7403 cPrecision);
7404
7405 /* Then do the polynomial evaluation. */
7406 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7407 cPrecision, &SoftState);
7408 r = f128_mul(z, r, &SoftState);
7409
7410 /* Output the result. */
7411 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7412# endif
7413 }
7414 return fFsw;
7415}
7416
7417
7418IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7419{
7420 uint16_t const fFcw = pFpuState->FCW;
7421 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7422
7423 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7424 {
7425 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7426 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7427 else
7428 {
7429 /* Special case:
7430 2^+1.0 - 1.0 = 1.0
7431 2^-1.0 - 1.0 = -0.5 */
7432 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7433 && pr80Val->s.uMantissa == RT_BIT_64(63))
7434 {
7435 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7436 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7437 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7438 }
7439 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7440 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7441 else
7442 pFpuRes->r80Result = *pr80Val;
7443 fFsw |= X86_FSW_PE;
7444 if (!(fFcw & X86_FCW_PM))
7445 fFsw |= X86_FSW_ES | X86_FSW_B;
7446 }
7447 }
7448 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7449 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7450 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7451 pFpuRes->r80Result = *pr80Val;
7452 else if (RTFLOAT80U_IS_INF(pr80Val))
7453 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7454 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7455 {
7456 fFsw |= X86_FSW_DE;
7457 if (fFcw & X86_FCW_DM)
7458 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7459 else
7460 {
7461 pFpuRes->r80Result = *pr80Val;
7462 fFsw |= X86_FSW_ES | X86_FSW_B;
7463 }
7464 }
7465 else
7466 {
7467 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7468 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7469 && (fFcw & X86_FCW_IM))
7470 pFpuRes->r80Result = g_r80Indefinite;
7471 else
7472 {
7473 pFpuRes->r80Result = *pr80Val;
7474 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7475 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7476 }
7477 fFsw |= X86_FSW_IE;
7478 if (!(fFcw & X86_FCW_IM))
7479 fFsw |= X86_FSW_ES | X86_FSW_B;
7480 }
7481 pFpuRes->FSW = fFsw;
7482}
7483
7484#endif /* IEM_WITHOUT_ASSEMBLY */
7485
7486IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7487{
7488 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7489}
7490
7491IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7492{
7493 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7494}
7495
7496#ifdef IEM_WITHOUT_ASSEMBLY
7497
7498IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7499{
7500 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7501 pFpuRes->r80Result = *pr80Val;
7502 pFpuRes->r80Result.s.fSign = 0;
7503}
7504
7505
7506IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7507{
7508 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7509 pFpuRes->r80Result = *pr80Val;
7510 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7511}
7512
7513
7514IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7515{
7516 uint16_t const fFcw = pFpuState->FCW;
7517 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7518
7519 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7520 {
7521 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7522 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7523
7524 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7525 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7526 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7527 }
7528 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7529 {
7530 fFsw |= X86_FSW_ZE;
7531 if (fFcw & X86_FCW_ZM)
7532 {
7533 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7534 pFpuResTwo->r80Result2 = *pr80Val;
7535 }
7536 else
7537 {
7538 pFpuResTwo->r80Result2 = *pr80Val;
7539 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7540 }
7541 }
7542 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7543 {
7544 fFsw |= X86_FSW_DE;
7545 if (fFcw & X86_FCW_DM)
7546 {
7547 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7548 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7549 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7550 int32_t iExponent = -16382;
7551 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7552 {
7553 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7554 iExponent--;
7555 }
7556
7557 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7558 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7559 }
7560 else
7561 {
7562 pFpuResTwo->r80Result2 = *pr80Val;
7563 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7564 }
7565 }
7566 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7567 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7568 {
7569 pFpuResTwo->r80Result1 = *pr80Val;
7570 pFpuResTwo->r80Result2 = *pr80Val;
7571 }
7572 else if (RTFLOAT80U_IS_INF(pr80Val))
7573 {
7574 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7575 pFpuResTwo->r80Result2 = *pr80Val;
7576 }
7577 else
7578 {
7579 if (fFcw & X86_FCW_IM)
7580 {
7581 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7582 pFpuResTwo->r80Result1 = g_r80Indefinite;
7583 else
7584 {
7585 pFpuResTwo->r80Result1 = *pr80Val;
7586 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7587 }
7588 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7589 }
7590 else
7591 {
7592 pFpuResTwo->r80Result2 = *pr80Val;
7593 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7594 }
7595 fFsw |= X86_FSW_IE;
7596 }
7597 pFpuResTwo->FSW = fFsw;
7598}
7599#endif /* IEM_WITHOUT_ASSEMBLY */
7600
7601#if defined(IEM_WITHOUT_ASSEMBLY)
7602
7603static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7604{
7605 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7606 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7607 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7608 extFloat80_t v;
7609 (void)fFcw;
7610
7611 v = extF80_ylog2x(y, x, &SoftState);
7612 iemFpuSoftF80ToIprt(pr80Result, v);
7613
7614 return fFsw;
7615}
7616
7617IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7618 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7619{
7620 uint16_t const fFcw = pFpuState->FCW;
7621 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7622
7623 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7624 {
7625 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7626
7627 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7628 if (!(fFcw & X86_FCW_PM))
7629 fFsw |= X86_FSW_ES | X86_FSW_B;
7630 }
7631 else
7632 {
7633 fFsw |= X86_FSW_IE;
7634
7635 if (!(fFcw & X86_FCW_IM))
7636 {
7637 pFpuRes->r80Result = *pr80Val2;
7638 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7639 }
7640 else
7641 {
7642 pFpuRes->r80Result = g_r80Indefinite;
7643 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7644 }
7645 }
7646
7647 pFpuRes->FSW = fFsw;
7648}
7649#endif /* IEM_WITHOUT_ASSEMBLY */
7650
7651IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7652 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7653{
7654 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7655}
7656
7657IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7658 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7659{
7660 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7661}
7662
7663#if defined(IEM_WITHOUT_ASSEMBLY)
7664
7665static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7666{
7667 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7668 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7669 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7670 extFloat80_t v;
7671 (void)fFcw;
7672
7673 v = extF80_ylog2xp1(y, x, &SoftState);
7674 iemFpuSoftF80ToIprt(pr80Result, v);
7675
7676 return fFsw;
7677}
7678
7679IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7680 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7681{
7682 uint16_t const fFcw = pFpuState->FCW;
7683 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7684
7685 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7686 {
7687 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7688
7689 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7690 if (!(fFcw & X86_FCW_PM))
7691 fFsw |= X86_FSW_ES | X86_FSW_B;
7692 }
7693 else
7694 {
7695 fFsw |= X86_FSW_IE;
7696
7697 if (!(fFcw & X86_FCW_IM))
7698 {
7699 pFpuRes->r80Result = *pr80Val2;
7700 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7701 }
7702 else
7703 {
7704 pFpuRes->r80Result = g_r80Indefinite;
7705 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7706 }
7707 }
7708
7709 pFpuRes->FSW = fFsw;
7710}
7711
7712#endif /* IEM_WITHOUT_ASSEMBLY */
7713
7714IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7715 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7716{
7717 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7718}
7719
7720IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7721 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7722{
7723 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7724}
7725
7726
7727/*********************************************************************************************************************************
7728* MMX, SSE & AVX *
7729*********************************************************************************************************************************/
7730
7731#ifdef IEM_WITH_VEX
7732
7733/*
7734 * VMOVSLDUP
7735 */
7736IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7737{
7738 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7739 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7740 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7741 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7742 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7743 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7744 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7745 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7746}
7747
7748
7749IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7750{
7751 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7752 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7753 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7754 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7755 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7756 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7757 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7758 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7759}
7760
7761#endif /* IEM_WITH_VEX */
7762
7763
7764#ifdef IEM_WITH_VEX
7765
7766/*
7767 * VMOVSHDUP
7768 */
7769IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7770{
7771 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7772 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7773 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7774 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7775 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7776 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7777 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7778 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7779}
7780
7781
7782IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7783{
7784 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7785 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7786 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7787 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7788 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7789 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7790 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7791 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7792}
7793
7794#endif /* IEM_WITH_VEX */
7795
7796
7797#ifdef IEM_WITH_VEX
7798
7799/*
7800 * VMOVDDUP
7801 */
7802IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7803{
7804 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7805 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7806 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7807 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7808}
7809
7810IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7811{
7812 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7813 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7814 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7815 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7816}
7817
7818#endif /* IEM_WITH_VEX */
7819
7820
7821/*
7822 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7823 */
7824#ifdef IEM_WITHOUT_ASSEMBLY
7825
7826IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7827{
7828 RT_NOREF(pFpuState);
7829 *puDst &= *puSrc;
7830}
7831
7832
7833IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7834{
7835 RT_NOREF(pFpuState);
7836 puDst->au64[0] &= puSrc->au64[0];
7837 puDst->au64[1] &= puSrc->au64[1];
7838}
7839
7840#endif
7841
7842IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7843 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7844{
7845 RT_NOREF(pExtState);
7846 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7847 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7848}
7849
7850
7851IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7852 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7853{
7854 RT_NOREF(pExtState);
7855 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7856 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7857 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7858 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7859}
7860
7861
7862/*
7863 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7864 */
7865#ifdef IEM_WITHOUT_ASSEMBLY
7866
7867IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7868{
7869 RT_NOREF(pFpuState);
7870 *puDst = ~*puDst & *puSrc;
7871}
7872
7873
7874IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7875{
7876 RT_NOREF(pFpuState);
7877 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7878 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7879}
7880
7881#endif
7882
7883IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7884 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7885{
7886 RT_NOREF(pExtState);
7887 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7888 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7889}
7890
7891
7892IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7893 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7894{
7895 RT_NOREF(pExtState);
7896 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7897 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7898 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7899 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7900}
7901
7902
7903/*
7904 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7905 */
7906#ifdef IEM_WITHOUT_ASSEMBLY
7907
7908IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7909{
7910 RT_NOREF(pFpuState);
7911 *puDst |= *puSrc;
7912}
7913
7914
7915IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7916{
7917 RT_NOREF(pFpuState);
7918 puDst->au64[0] |= puSrc->au64[0];
7919 puDst->au64[1] |= puSrc->au64[1];
7920}
7921
7922#endif
7923
7924IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7925 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7926{
7927 RT_NOREF(pExtState);
7928 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7929 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7930}
7931
7932
7933IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7934 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7935{
7936 RT_NOREF(pExtState);
7937 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7938 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7939 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7940 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7941}
7942
7943
7944/*
7945 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7946 */
7947#ifdef IEM_WITHOUT_ASSEMBLY
7948
7949IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7950{
7951 RT_NOREF(pFpuState);
7952 *puDst ^= *puSrc;
7953}
7954
7955
7956IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7957{
7958 RT_NOREF(pFpuState);
7959 puDst->au64[0] ^= puSrc->au64[0];
7960 puDst->au64[1] ^= puSrc->au64[1];
7961}
7962
7963#endif
7964
7965IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7966 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7967{
7968 RT_NOREF(pExtState);
7969 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7970 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7971}
7972
7973
7974IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7975 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7976{
7977 RT_NOREF(pExtState);
7978 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7979 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7980 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7981 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7982}
7983
7984
7985/*
7986 * PCMPEQB / VPCMPEQB
7987 */
7988#ifdef IEM_WITHOUT_ASSEMBLY
7989
7990IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7991{
7992 RT_NOREF(pFpuState);
7993 RTUINT64U uSrc1 = { *puDst };
7994 RTUINT64U uSrc2 = { *puSrc };
7995 RTUINT64U uDst;
7996 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7997 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7998 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7999 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
8000 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
8001 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
8002 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
8003 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
8004 *puDst = uDst.u;
8005}
8006
8007
8008IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8009{
8010 RT_NOREF(pFpuState);
8011 RTUINT128U uSrc1 = *puDst;
8012 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
8013 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
8014 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
8015 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
8016 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
8017 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
8018 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
8019 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
8020 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
8021 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
8022 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
8023 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
8024 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
8025 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
8026 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
8027 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
8028}
8029
8030#endif
8031
8032IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8033 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8034{
8035 RT_NOREF(pExtState);
8036 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8037 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8038 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8039 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8040 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8041 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8042 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8043 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8044 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8045 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8046 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8047 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8048 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8049 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8050 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8051 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8052}
8053
8054IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8055 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8056{
8057 RT_NOREF(pExtState);
8058 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8059 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8060 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8061 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8062 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8063 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8064 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8065 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8066 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8067 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8068 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8069 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8070 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8071 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8072 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8073 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8074 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
8075 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
8076 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
8077 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
8078 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
8079 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
8080 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
8081 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
8082 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
8083 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
8084 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
8085 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
8086 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
8087 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
8088 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
8089 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
8090}
8091
8092
8093/*
8094 * PCMPEQW / VPCMPEQW
8095 */
8096#ifdef IEM_WITHOUT_ASSEMBLY
8097
8098IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8099{
8100 RT_NOREF(pFpuState);
8101 RTUINT64U uSrc1 = { *puDst };
8102 RTUINT64U uSrc2 = { *puSrc };
8103 RTUINT64U uDst;
8104 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
8105 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
8106 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
8107 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
8108 *puDst = uDst.u;
8109}
8110
8111
8112IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8113{
8114 RT_NOREF(pFpuState);
8115 RTUINT128U uSrc1 = *puDst;
8116 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
8117 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
8118 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
8119 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
8120 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
8121 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
8122 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
8123 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
8124}
8125
8126#endif
8127
8128IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8129 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8130{
8131 RT_NOREF(pExtState);
8132 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8133 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8134 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8135 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8136 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8137 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8138 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8139 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8140}
8141
8142IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8143 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8144{
8145 RT_NOREF(pExtState);
8146 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8147 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8148 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8149 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8150 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8151 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8152 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8153 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8154 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8155 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8156 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8157 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8158 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8159 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8160 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8161 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8162}
8163
8164
8165/*
8166 * PCMPEQD / VPCMPEQD.
8167 */
8168#ifdef IEM_WITHOUT_ASSEMBLY
8169
8170IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8171{
8172 RT_NOREF(pFpuState);
8173 RTUINT64U uSrc1 = { *puDst };
8174 RTUINT64U uSrc2 = { *puSrc };
8175 RTUINT64U uDst;
8176 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8177 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8178 *puDst = uDst.u;
8179}
8180
8181
8182IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8183{
8184 RT_NOREF(pFpuState);
8185 RTUINT128U uSrc1 = *puDst;
8186 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8187 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8188 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8189 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8190}
8191
8192#endif /* IEM_WITHOUT_ASSEMBLY */
8193
8194IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8195 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8196{
8197 RT_NOREF(pExtState);
8198 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8199 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8200 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8201 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8202}
8203
8204IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8205 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8206{
8207 RT_NOREF(pExtState);
8208 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8209 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8210 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8211 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8212 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8213 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8214 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8215 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8216}
8217
8218
8219/*
8220 * PCMPEQQ / VPCMPEQQ.
8221 */
8222IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8223{
8224 RT_NOREF(pFpuState);
8225 RTUINT128U uSrc1 = *puDst;
8226 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8227 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8228}
8229
8230IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8231 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8232{
8233 RT_NOREF(pExtState);
8234 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8235 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8236}
8237
8238IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8239 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8240{
8241 RT_NOREF(pExtState);
8242 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8243 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8244 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8245 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8246}
8247
8248
8249/*
8250 * PCMPGTB / VPCMPGTB
8251 */
8252#ifdef IEM_WITHOUT_ASSEMBLY
8253
8254IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8255{
8256 RT_NOREF(pFpuState);
8257 RTUINT64U uSrc1 = { *puDst };
8258 RTUINT64U uSrc2 = { *puSrc };
8259 RTUINT64U uDst;
8260 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8261 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8262 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8263 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8264 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8265 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8266 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8267 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8268 *puDst = uDst.u;
8269}
8270
8271
8272IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8273{
8274 RT_NOREF(pFpuState);
8275 RTUINT128U uSrc1 = *puDst;
8276 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8277 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8278 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8279 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8280 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8281 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8282 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8283 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8284 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8285 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8286 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8287 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8288 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8289 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8290 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8291 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8292}
8293
8294#endif
8295
8296IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8297 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8298{
8299 RT_NOREF(pExtState);
8300 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8301 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8302 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8303 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8304 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8305 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8306 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8307 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8308 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8309 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8310 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8311 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8312 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8313 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8314 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8315 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8316}
8317
8318IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8319 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8320{
8321 RT_NOREF(pExtState);
8322 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8323 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8324 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8325 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8326 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8327 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8328 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8329 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8330 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8331 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8332 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8333 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8334 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8335 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8336 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8337 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8338 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8339 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8340 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8341 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8342 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8343 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8344 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8345 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8346 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8347 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8348 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8349 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8350 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8351 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8352 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8353 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8354}
8355
8356
8357/*
8358 * PCMPGTW / VPCMPGTW
8359 */
8360#ifdef IEM_WITHOUT_ASSEMBLY
8361
8362IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8363{
8364 RT_NOREF(pFpuState);
8365 RTUINT64U uSrc1 = { *puDst };
8366 RTUINT64U uSrc2 = { *puSrc };
8367 RTUINT64U uDst;
8368 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8369 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8370 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8371 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8372 *puDst = uDst.u;
8373}
8374
8375
8376IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8377{
8378 RT_NOREF(pFpuState);
8379 RTUINT128U uSrc1 = *puDst;
8380 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8381 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8382 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8383 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8384 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8385 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8386 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8387 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8388}
8389
8390#endif
8391
8392IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8393 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8394{
8395 RT_NOREF(pExtState);
8396 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8397 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8398 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8399 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8400 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8401 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8402 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8403 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8404}
8405
8406IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8407 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8408{
8409 RT_NOREF(pExtState);
8410 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8411 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8412 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8413 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8414 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8415 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8416 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8417 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8418 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8419 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8420 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8421 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8422 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8423 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8424 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8425 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8426}
8427
8428
8429/*
8430 * PCMPGTD / VPCMPGTD.
8431 */
8432#ifdef IEM_WITHOUT_ASSEMBLY
8433
8434IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8435{
8436 RT_NOREF(pFpuState);
8437 RTUINT64U uSrc1 = { *puDst };
8438 RTUINT64U uSrc2 = { *puSrc };
8439 RTUINT64U uDst;
8440 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8441 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8442 *puDst = uDst.u;
8443}
8444
8445
8446IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8447{
8448 RT_NOREF(pFpuState);
8449 RTUINT128U uSrc1 = *puDst;
8450 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8451 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8452 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8453 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8454}
8455
8456#endif /* IEM_WITHOUT_ASSEMBLY */
8457
8458IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8459 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8460{
8461 RT_NOREF(pExtState);
8462 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8463 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8464 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8465 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8466}
8467
8468IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8469 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8470{
8471 RT_NOREF(pExtState);
8472 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8473 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8474 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8475 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8476 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8477 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8478 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8479 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8480}
8481
8482
8483/*
8484 * PCMPGTQ / VPCMPGTQ.
8485 */
8486IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8487{
8488 RT_NOREF(pFpuState);
8489 RTUINT128U uSrc1 = *puDst;
8490 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8491 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8492}
8493
8494IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8495 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8496{
8497 RT_NOREF(pExtState);
8498 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8499 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8500}
8501
8502IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8503 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8504{
8505 RT_NOREF(pExtState);
8506 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8507 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8508 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8509 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8510}
8511
8512
8513/*
8514 * PADDB / VPADDB
8515 */
8516#ifdef IEM_WITHOUT_ASSEMBLY
8517
8518IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8519{
8520 RT_NOREF(pFpuState);
8521 RTUINT64U uSrc1 = { *puDst };
8522 RTUINT64U uSrc2 = { *puSrc };
8523 RTUINT64U uDst;
8524 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8525 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8526 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8527 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8528 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8529 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8530 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8531 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8532 *puDst = uDst.u;
8533}
8534
8535
8536IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8537{
8538 RT_NOREF(pFpuState);
8539 RTUINT128U uSrc1 = *puDst;
8540 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8541 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8542 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8543 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8544 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8545 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8546 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8547 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8548 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8549 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8550 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8551 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8552 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8553 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8554 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8555 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8556}
8557
8558#endif
8559
8560
8561IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8562 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8563{
8564 RT_NOREF(pExtState);
8565 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8566 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8567 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8568 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8569 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8570 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8571 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8572 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8573 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8574 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8575 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8576 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8577 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8578 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8579 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8580 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8581}
8582
8583IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8584 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8585{
8586 RT_NOREF(pExtState);
8587 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8588 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8589 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8590 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8591 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8592 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8593 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8594 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8595 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8596 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8597 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8598 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8599 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8600 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8601 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8602 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8603 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8604 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8605 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8606 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8607 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8608 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8609 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8610 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8611 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8612 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8613 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8614 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8615 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8616 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8617 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8618 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8619}
8620
8621
8622/*
8623 * PADDSB / VPADDSB
8624 */
/**
 * Clamps a signed value to the signed byte range, yielding the byte bit pattern.
 *
 * Adding 0x80 and truncating to 16 bits maps the range -128..127 onto 0..0xff,
 * so anything above 0xff after that bias is out of range.  Out-of-range values
 * produce 0x7f (INT8_MAX) plus bit 15 of the input, i.e. 0x80 (INT8_MIN) when
 * the sign bit is set.  In-range values are simply truncated to 8 bits.
 *
 * @note    The argument is expanded more than once; pass side-effect free
 *          expressions only.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) /* saturate: 0x7f, or 0x80 when negative */ \
      : (uint8_t)(a_iWord) )
8629
8630#ifdef IEM_WITHOUT_ASSEMBLY
8631
8632IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8633{
8634 RT_NOREF(pFpuState);
8635 RTUINT64U uSrc1 = { *puDst };
8636 RTUINT64U uSrc2 = { *puSrc };
8637 RTUINT64U uDst;
8638 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8639 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8640 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8641 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8642 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8643 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8644 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8645 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8646 *puDst = uDst.u;
8647}
8648
8649
8650IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8651{
8652 RT_NOREF(pFpuState);
8653 RTUINT128U uSrc1 = *puDst;
8654 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8655 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8656 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8657 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8658 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8659 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8660 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8661 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8662 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8663 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8664 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8665 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8666 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8667 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8668 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8669 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8670}
8671
8672#endif
8673
8674IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u128_fallback,(PRTUINT128U puDst,
8675 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8676{
8677 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8678 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8679 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8680 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8681 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8682 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8683 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8684 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8685 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8686 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8687 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8688 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8689 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8690 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8691 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8692 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8693}
8694
8695IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u256_fallback,(PRTUINT256U puDst,
8696 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8697{
8698 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8699 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8700 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8701 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8702 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8703 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8704 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8705 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8706 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8707 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8708 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8709 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8710 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8711 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8712 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8713 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8714 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] + puSrc2->ai8[16]);
8715 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] + puSrc2->ai8[17]);
8716 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] + puSrc2->ai8[18]);
8717 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] + puSrc2->ai8[19]);
8718 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] + puSrc2->ai8[20]);
8719 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] + puSrc2->ai8[21]);
8720 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] + puSrc2->ai8[22]);
8721 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] + puSrc2->ai8[23]);
8722 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] + puSrc2->ai8[24]);
8723 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] + puSrc2->ai8[25]);
8724 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] + puSrc2->ai8[26]);
8725 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] + puSrc2->ai8[27]);
8726 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] + puSrc2->ai8[28]);
8727 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] + puSrc2->ai8[29]);
8728 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] + puSrc2->ai8[30]);
8729 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] + puSrc2->ai8[31]);
8730}
8731
8732
8733/*
8734 * PADDUSB / VPADDUSB
8735 */
/** Clamps an unsigned 16-bit intermediate to the byte range: anything above
 *  0xff (UINT8_MAX) yields 0xff, everything else is truncated to its low byte.
 *  @note a_uWord is expanded twice, so it must be free of side effects. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
8740
8741#ifdef IEM_WITHOUT_ASSEMBLY
8742
8743IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8744{
8745 RT_NOREF(pFpuState);
8746 RTUINT64U uSrc1 = { *puDst };
8747 RTUINT64U uSrc2 = { *puSrc };
8748 RTUINT64U uDst;
8749 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8750 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8751 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8752 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8753 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8754 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8755 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8756 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8757 *puDst = uDst.u;
8758}
8759
8760
8761IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8762{
8763 RT_NOREF(pFpuState);
8764 RTUINT128U uSrc1 = *puDst;
8765 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8766 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8767 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8768 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8769 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8770 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8771 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8772 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8773 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8774 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8775 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8776 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8777 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8778 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8779 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8780 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8781}
8782
8783#endif
8784
8785IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u128_fallback,(PRTUINT128U puDst,
8786 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8787{
8788 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8789 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8790 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8791 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8792 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8793 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8794 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8795 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8796 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8797 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8798 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8799 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8800 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8801 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8802 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8803 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8804}
8805
8806IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u256_fallback,(PRTUINT256U puDst,
8807 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8808{
8809 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8810 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8811 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8812 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8813 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8814 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8815 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8816 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8817 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8818 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8819 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8820 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8821 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8822 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8823 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8824 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8825 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[16] + puSrc2->au8[16]);
8826 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[17] + puSrc2->au8[17]);
8827 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[18] + puSrc2->au8[18]);
8828 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[19] + puSrc2->au8[19]);
8829 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[20] + puSrc2->au8[20]);
8830 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[21] + puSrc2->au8[21]);
8831 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[22] + puSrc2->au8[22]);
8832 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[23] + puSrc2->au8[23]);
8833 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[24] + puSrc2->au8[24]);
8834 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[25] + puSrc2->au8[25]);
8835 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[26] + puSrc2->au8[26]);
8836 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[27] + puSrc2->au8[27]);
8837 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[28] + puSrc2->au8[28]);
8838 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[29] + puSrc2->au8[29]);
8839 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[30] + puSrc2->au8[30]);
8840 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[31] + puSrc2->au8[31]);
8841}
8842
8843
8844/*
8845 * PADDW / VPADDW
8846 */
8847#ifdef IEM_WITHOUT_ASSEMBLY
8848
8849IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8850{
8851 RT_NOREF(pFpuState);
8852 RTUINT64U uSrc1 = { *puDst };
8853 RTUINT64U uSrc2 = { *puSrc };
8854 RTUINT64U uDst;
8855 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8856 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8857 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8858 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8859 *puDst = uDst.u;
8860}
8861
8862
8863IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8864{
8865 RT_NOREF(pFpuState);
8866 RTUINT128U uSrc1 = *puDst;
8867 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8868 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8869 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8870 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8871 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8872 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8873 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8874 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8875}
8876
8877#endif
8878
8879
8880IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8881 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8882{
8883 RT_NOREF(pExtState);
8884 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8885 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8886 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8887 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8888 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8889 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8890 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8891 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8892}
8893
8894IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8895 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8896{
8897 RT_NOREF(pExtState);
8898 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8899 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8900 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8901 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8902 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8903 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8904 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8905 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8906 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8907 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8908 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8909 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8910 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8911 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8912 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8913 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8914}
8915
8916
8917/*
8918 * PADDSW / VPADDSW
8919 */
/** Clamps a signed 32-bit intermediate to the int16_t range and yields it as
 *  a 16-bit pattern.  Biasing by 0x8000 maps the in-range values onto
 *  0..0xffff; anything outside becomes 0x7fff (INT16_MAX) plus the source
 *  sign bit (bit 31), i.e. 0x8000 (INT16_MIN) for negative input.
 *  @note a_iDword is expanded several times, so it must be free of side effects. */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) )
8924
8925#ifdef IEM_WITHOUT_ASSEMBLY
8926
8927IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8928{
8929 RT_NOREF(pFpuState);
8930 RTUINT64U uSrc1 = { *puDst };
8931 RTUINT64U uSrc2 = { *puSrc };
8932 RTUINT64U uDst;
8933 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8934 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8935 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8936 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8937 *puDst = uDst.u;
8938}
8939
8940
8941IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8942{
8943 RT_NOREF(pFpuState);
8944 RTUINT128U uSrc1 = *puDst;
8945 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8946 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8947 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8948 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8949 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8950 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8951 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8952 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8953}
8954
8955#endif
8956
8957IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u128_fallback,(PRTUINT128U puDst,
8958 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8959{
8960 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8961 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8962 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8963 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8964 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8965 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8966 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8967 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8968}
8969
8970IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u256_fallback,(PRTUINT256U puDst,
8971 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8972{
8973 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8974 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8975 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8976 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8977 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8978 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8979 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8980 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8981 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] + puSrc2->ai16[8]);
8982 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] + puSrc2->ai16[9]);
8983 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc2->ai16[10]);
8984 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] + puSrc2->ai16[11]);
8985 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc2->ai16[12]);
8986 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] + puSrc2->ai16[13]);
8987 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc2->ai16[14]);
8988 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] + puSrc2->ai16[15]);
8989}
8990
8991
8992/*
8993 * PADDUSW / VPADDUSW
8994 */
/** Clamps an unsigned 32-bit intermediate to the 16-bit range: anything above
 *  0xffff (UINT16_MAX) yields 0xffff, everything else is truncated to its low
 *  word.
 *  @note a_uDword is expanded twice, so it must be free of side effects. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
8999
9000#ifdef IEM_WITHOUT_ASSEMBLY
9001
9002IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9003{
9004 RT_NOREF(pFpuState);
9005 RTUINT64U uSrc1 = { *puDst };
9006 RTUINT64U uSrc2 = { *puSrc };
9007 RTUINT64U uDst;
9008 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
9009 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
9010 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
9011 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
9012 *puDst = uDst.u;
9013}
9014
9015
9016IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9017{
9018 RT_NOREF(pFpuState);
9019 RTUINT128U uSrc1 = *puDst;
9020 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
9021 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
9022 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
9023 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
9024 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
9025 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
9026 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
9027 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
9028}
9029
9030#endif
9031
9032IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u128_fallback,(PRTUINT128U puDst,
9033 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9034{
9035 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
9036 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
9037 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
9038 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
9039 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
9040 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
9041 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
9042 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
9043}
9044
9045IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u256_fallback,(PRTUINT256U puDst,
9046 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9047{
9048 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
9049 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
9050 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
9051 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
9052 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
9053 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
9054 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
9055 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
9056 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[8] + puSrc2->au16[8]);
9057 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[9] + puSrc2->au16[9]);
9058 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[10] + puSrc2->au16[10]);
9059 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[11] + puSrc2->au16[11]);
9060 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[12] + puSrc2->au16[12]);
9061 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[13] + puSrc2->au16[13]);
9062 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[14] + puSrc2->au16[14]);
9063 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[15] + puSrc2->au16[15]);
9064}
9065
9066
9067/*
9068 * PADDD / VPADDD.
9069 */
9070#ifdef IEM_WITHOUT_ASSEMBLY
9071
9072IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9073{
9074 RT_NOREF(pFpuState);
9075 RTUINT64U uSrc1 = { *puDst };
9076 RTUINT64U uSrc2 = { *puSrc };
9077 RTUINT64U uDst;
9078 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
9079 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
9080 *puDst = uDst.u;
9081}
9082
9083
9084IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9085{
9086 RT_NOREF(pFpuState);
9087 RTUINT128U uSrc1 = *puDst;
9088 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
9089 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
9090 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
9091 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
9092}
9093
9094#endif /* IEM_WITHOUT_ASSEMBLY */
9095
9096IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9097 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9098{
9099 RT_NOREF(pExtState);
9100 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9101 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9102 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9103 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9104}
9105
9106IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9107 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9108{
9109 RT_NOREF(pExtState);
9110 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9111 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9112 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9113 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9114 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
9115 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
9116 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
9117 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
9118}
9119
9120
9121/*
9122 * PADDQ / VPADDQ.
9123 */
9124#ifdef IEM_WITHOUT_ASSEMBLY
9125
9126IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9127{
9128 RT_NOREF(pFpuState);
9129 *puDst = *puDst + *puSrc;
9130}
9131
9132IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9133{
9134 RT_NOREF(pFpuState);
9135 RTUINT128U uSrc1 = *puDst;
9136 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
9137 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
9138}
9139
9140#endif
9141
9142IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9143 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9144{
9145 RT_NOREF(pExtState);
9146 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9147 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9148}
9149
9150IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9151 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9152{
9153 RT_NOREF(pExtState);
9154 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9155 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9156 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
9157 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
9158}
9159
9160
9161/*
9162 * PSUBB / VPSUBB
9163 */
9164#ifdef IEM_WITHOUT_ASSEMBLY
9165
9166IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9167{
9168 RT_NOREF(pFpuState);
9169 RTUINT64U uSrc1 = { *puDst };
9170 RTUINT64U uSrc2 = { *puSrc };
9171 RTUINT64U uDst;
9172 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
9173 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
9174 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
9175 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
9176 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
9177 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
9178 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
9179 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
9180 *puDst = uDst.u;
9181}
9182
9183
9184IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9185{
9186 RT_NOREF(pFpuState);
9187 RTUINT128U uSrc1 = *puDst;
9188 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
9189 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
9190 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
9191 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
9192 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
9193 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
9194 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
9195 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
9196 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
9197 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
9198 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
9199 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
9200 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
9201 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
9202 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
9203 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
9204}
9205
9206#endif
9207
9208IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9209 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9210{
9211 RT_NOREF(pExtState);
9212 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9213 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9214 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9215 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9216 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9217 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9218 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9219 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9220 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9221 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9222 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9223 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9224 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9225 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9226 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9227 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9228}
9229
9230IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9231 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9232{
9233 RT_NOREF(pExtState);
9234 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9235 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9236 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9237 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9238 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9239 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9240 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9241 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9242 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9243 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9244 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9245 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9246 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9247 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9248 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9249 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9250 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
9251 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
9252 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
9253 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
9254 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
9255 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
9256 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
9257 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
9258 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
9259 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
9260 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
9261 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
9262 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
9263 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
9264 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
9265 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
9266}
9267
9268
9269/*
9270 * PSUBSB / VPSUBSB
9271 */
9272#ifdef IEM_WITHOUT_ASSEMBLY
9273
9274IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9275{
9276 RT_NOREF(pFpuState);
9277 RTUINT64U uSrc1 = { *puDst };
9278 RTUINT64U uSrc2 = { *puSrc };
9279 RTUINT64U uDst;
9280 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
9281 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
9282 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
9283 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
9284 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
9285 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
9286 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
9287 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
9288 *puDst = uDst.u;
9289}
9290
9291
9292IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9293{
9294 RT_NOREF(pFpuState);
9295 RTUINT128U uSrc1 = *puDst;
9296 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
9297 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
9298 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
9299 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
9300 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
9301 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
9302 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
9303 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
9304 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
9305 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
9306 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
9307 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
9308 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
9309 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
9310 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
9311 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
9312}
9313
9314#endif
9315
9316IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u128_fallback,(PRTUINT128U puDst,
9317 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9318{
9319 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9320 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9321 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9322 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9323 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9324 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9325 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9326 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9327 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9328 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9329 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9330 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9331 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9332 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9333 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9334 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9335}
9336
9337IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u256_fallback,(PRTUINT256U puDst,
9338 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9339{
9340 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9341 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9342 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9343 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9344 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9345 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9346 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9347 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9348 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9349 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9350 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9351 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9352 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9353 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9354 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9355 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9356 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] - puSrc2->ai8[16]);
9357 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] - puSrc2->ai8[17]);
9358 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] - puSrc2->ai8[18]);
9359 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] - puSrc2->ai8[19]);
9360 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] - puSrc2->ai8[20]);
9361 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] - puSrc2->ai8[21]);
9362 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] - puSrc2->ai8[22]);
9363 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] - puSrc2->ai8[23]);
9364 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] - puSrc2->ai8[24]);
9365 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] - puSrc2->ai8[25]);
9366 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] - puSrc2->ai8[26]);
9367 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] - puSrc2->ai8[27]);
9368 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] - puSrc2->ai8[28]);
9369 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] - puSrc2->ai8[29]);
9370 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] - puSrc2->ai8[30]);
9371 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] - puSrc2->ai8[31]);
9372}
9373
9374
9375/*
9376 * PSUBUSB / VPSUBUSB
9377 */
/** Saturates the outcome of an unsigned byte subtraction.
 *
 * The argument is reduced to 16 bits; a value of at most 0xff is returned as
 * a byte, while anything larger (which is what a negative difference turns
 * into after the reduction) yields zero.
 *
 * @note    @a a_uWord appears twice in the expansion, so do not pass
 *          expressions with side effects. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
9382
9383#ifdef IEM_WITHOUT_ASSEMBLY
9384
9385IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9386{
9387 RT_NOREF(pFpuState);
9388 RTUINT64U uSrc1 = { *puDst };
9389 RTUINT64U uSrc2 = { *puSrc };
9390 RTUINT64U uDst;
9391 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9392 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9393 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9394 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9395 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9396 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9397 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9398 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9399 *puDst = uDst.u;
9400}
9401
9402
9403IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9404{
9405 RT_NOREF(pFpuState);
9406 RTUINT128U uSrc1 = *puDst;
9407 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9408 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9409 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9410 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9411 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9412 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9413 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9414 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9415 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9416 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9417 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9418 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9419 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9420 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9421 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9422 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9423}
9424
9425#endif
9426
9427IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u128_fallback,(PRTUINT128U puDst,
9428 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9429{
9430 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9431 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9432 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9433 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9434 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9435 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9436 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9437 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9438 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9439 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9440 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9441 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9442 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9443 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9444 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9445 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9446}
9447
9448IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u256_fallback,(PRTUINT256U puDst,
9449 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9450{
9451 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9452 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9453 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9454 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9455 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9456 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9457 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9458 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9459 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9460 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9461 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9462 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9463 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9464 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9465 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9466 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9467 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[16] - puSrc2->au8[16]);
9468 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[17] - puSrc2->au8[17]);
9469 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[18] - puSrc2->au8[18]);
9470 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[19] - puSrc2->au8[19]);
9471 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[20] - puSrc2->au8[20]);
9472 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[21] - puSrc2->au8[21]);
9473 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[22] - puSrc2->au8[22]);
9474 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[23] - puSrc2->au8[23]);
9475 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[24] - puSrc2->au8[24]);
9476 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[25] - puSrc2->au8[25]);
9477 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[26] - puSrc2->au8[26]);
9478 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[27] - puSrc2->au8[27]);
9479 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[28] - puSrc2->au8[28]);
9480 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[29] - puSrc2->au8[29]);
9481 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[30] - puSrc2->au8[30]);
9482 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[31] - puSrc2->au8[31]);
9483}
9484
9485
9486/*
9487 * PSUBW / VPSUBW
9488 */
9489#ifdef IEM_WITHOUT_ASSEMBLY
9490
9491IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9492{
9493 RT_NOREF(pFpuState);
9494 RTUINT64U uSrc1 = { *puDst };
9495 RTUINT64U uSrc2 = { *puSrc };
9496 RTUINT64U uDst;
9497 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9498 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9499 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9500 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9501 *puDst = uDst.u;
9502}
9503
9504
9505IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9506{
9507 RT_NOREF(pFpuState);
9508 RTUINT128U uSrc1 = *puDst;
9509 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9510 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9511 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9512 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9513 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9514 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9515 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9516 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9517}
9518
9519#endif
9520
9521IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9522 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9523{
9524 RT_NOREF(pExtState);
9525 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9526 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9527 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9528 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9529 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9530 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9531 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9532 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9533}
9534
9535IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9536 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9537{
9538 RT_NOREF(pExtState);
9539 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9540 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9541 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9542 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9543 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9544 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9545 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9546 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9547 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9548 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9549 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9550 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9551 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9552 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9553 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9554 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9555}
9556
9557
9558/*
9559 * PSUBSW / VPSUBSW
9560 */
9561#ifdef IEM_WITHOUT_ASSEMBLY
9562
9563IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9564{
9565 RT_NOREF(pFpuState);
9566 RTUINT64U uSrc1 = { *puDst };
9567 RTUINT64U uSrc2 = { *puSrc };
9568 RTUINT64U uDst;
9569 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9570 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9571 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9572 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9573 *puDst = uDst.u;
9574}
9575
9576
9577IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9578{
9579 RT_NOREF(pFpuState);
9580 RTUINT128U uSrc1 = *puDst;
9581 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9582 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9583 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9584 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9585 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9586 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9587 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9588 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9589}
9590
9591#endif
9592
9593IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u128_fallback,(PRTUINT128U puDst,
9594 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9595{
9596 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9597 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9598 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9599 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9600 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9601 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9602 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9603 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9604}
9605
9606IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u256_fallback,(PRTUINT256U puDst,
9607 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9608{
9609 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9610 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9611 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9612 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9613 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9614 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9615 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9616 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9617 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] - puSrc2->ai16[8]);
9618 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] - puSrc2->ai16[9]);
9619 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc2->ai16[10]);
9620 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] - puSrc2->ai16[11]);
9621 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc2->ai16[12]);
9622 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] - puSrc2->ai16[13]);
9623 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc2->ai16[14]);
9624 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] - puSrc2->ai16[15]);
9625}
9626
9627
9628/*
9629 * PSUBUSW / VPSUBUSW
9630 */
/** Saturates the outcome of an unsigned word subtraction.
 *
 * The argument is converted to 32 bits unsigned; a value of at most 0xffff is
 * returned as a word, while anything larger (which is what a negative
 * difference turns into after the conversion) yields zero.
 *
 * @note    @a a_uDword appears twice in the expansion, so do not pass
 *          expressions with side effects. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9635
9636#ifdef IEM_WITHOUT_ASSEMBLY
9637
9638IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9639{
9640 RT_NOREF(pFpuState);
9641 RTUINT64U uSrc1 = { *puDst };
9642 RTUINT64U uSrc2 = { *puSrc };
9643 RTUINT64U uDst;
9644 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9645 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9646 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9647 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9648 *puDst = uDst.u;
9649}
9650
9651
9652IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9653{
9654 RT_NOREF(pFpuState);
9655 RTUINT128U uSrc1 = *puDst;
9656 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9657 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9658 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9659 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9660 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9661 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9662 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9663 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9664}
9665
9666#endif
9667
9668IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u128_fallback,(PRTUINT128U puDst,
9669 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9670{
9671 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9672 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9673 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9674 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9675 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9676 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9677 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9678 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9679}
9680
9681IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u256_fallback,(PRTUINT256U puDst,
9682 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9683{
9684 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9685 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9686 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9687 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9688 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9689 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9690 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9691 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9692 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[8] - puSrc2->au16[8]);
9693 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[9] - puSrc2->au16[9]);
9694 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[10] - puSrc2->au16[10]);
9695 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[11] - puSrc2->au16[11]);
9696 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[12] - puSrc2->au16[12]);
9697 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[13] - puSrc2->au16[13]);
9698 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[14] - puSrc2->au16[14]);
9699 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[15] - puSrc2->au16[15]);
9700}
9701
9702
9703
9704/*
9705 * PSUBD / VPSUBD.
9706 */
9707#ifdef IEM_WITHOUT_ASSEMBLY
9708
9709IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9710{
9711 RT_NOREF(pFpuState);
9712 RTUINT64U uSrc1 = { *puDst };
9713 RTUINT64U uSrc2 = { *puSrc };
9714 RTUINT64U uDst;
9715 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9716 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9717 *puDst = uDst.u;
9718}
9719
9720
9721IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9722{
9723 RT_NOREF(pFpuState);
9724 RTUINT128U uSrc1 = *puDst;
9725 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9726 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9727 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9728 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9729}
9730
9731#endif /* IEM_WITHOUT_ASSEMBLY */
9732
9733IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9734 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9735{
9736 RT_NOREF(pExtState);
9737 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9738 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9739 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9740 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9741}
9742
9743IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9744 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9745{
9746 RT_NOREF(pExtState);
9747 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9748 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9749 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9750 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9751 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9752 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9753 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9754 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9755}
9756
9757
9758/*
9759 * PSUBQ / VPSUBQ.
9760 */
9761#ifdef IEM_WITHOUT_ASSEMBLY
9762
9763IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9764{
9765 RT_NOREF(pFpuState);
9766 *puDst = *puDst - *puSrc;
9767}
9768
9769IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9770{
9771 RT_NOREF(pFpuState);
9772 RTUINT128U uSrc1 = *puDst;
9773 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9774 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9775}
9776
9777#endif
9778
9779IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9780 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9781{
9782 RT_NOREF(pExtState);
9783 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9784 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9785}
9786
9787IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9788 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9789{
9790 RT_NOREF(pExtState);
9791 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9792 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9793 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9794 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9795}
9796
9797
9798
9799/*
9800 * PMULLW / VPMULLW / PMULLD / VPMULLD
9801 */
9802#ifdef IEM_WITHOUT_ASSEMBLY
9803
9804IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9805{
9806 RT_NOREF(pFpuState);
9807 RTUINT64U uSrc1 = { *puDst };
9808 RTUINT64U uSrc2 = { *puSrc };
9809 RTUINT64U uDst;
9810 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9811 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9812 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9813 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9814 *puDst = uDst.u;
9815}
9816
9817
9818IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9819{
9820 RT_NOREF(pFpuState);
9821 RTUINT128U uSrc1 = *puDst;
9822 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9823 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9824 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9825 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9826 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9827 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9828 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9829 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9830}
9831
9832#endif
9833
9834IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9835{
9836 RTUINT128U uSrc1 = *puDst;
9837
9838 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9839 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9840 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9841 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9842 RT_NOREF(pFpuState);
9843}
9844
9845
9846IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9847{
9848 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9849 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9850 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9851 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9852 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9853 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9854 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9855 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9856}
9857
9858
9859IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9860{
9861 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9862 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9863 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9864 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9865 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9866 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9867 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9868 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9869 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9870 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9871 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9872 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9873 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9874 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9875 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9876 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9877}
9878
9879
9880IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9881{
9882 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9883 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9884 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9885 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9886}
9887
9888
9889IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9890{
9891 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9892 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9893 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9894 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9895 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9896 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9897 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9898 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9899}
9900
9901
9902/*
9903 * PMULHW / VPMULHW
9904 */
9905#ifdef IEM_WITHOUT_ASSEMBLY
9906
9907IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9908{
9909 RT_NOREF(pFpuState);
9910 RTUINT64U uSrc1 = { *puDst };
9911 RTUINT64U uSrc2 = { *puSrc };
9912 RTUINT64U uDst;
9913 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9914 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9915 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9916 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9917 *puDst = uDst.u;
9918}
9919
9920
9921IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9922{
9923 RT_NOREF(pFpuState);
9924 RTUINT128U uSrc1 = *puDst;
9925 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9926 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9927 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9928 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9929 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9930 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9931 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9932 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9933}
9934
9935#endif
9936
9937IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9938{
9939 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9940 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9941 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9942 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9943 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9944 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9945 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9946 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9947}
9948
9949
9950IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9951{
9952 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9953 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9954 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9955 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9956 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9957 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9958 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9959 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9960 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9961 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9962 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9963 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9964 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9965 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9966 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9967 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9968}
9969
9970
9971/*
9972 * PMULHUW / VPMULHUW
9973 */
9974#ifdef IEM_WITHOUT_ASSEMBLY
9975
9976IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9977{
9978 RTUINT64U uSrc1 = { *puDst };
9979 RTUINT64U uSrc2 = { *puSrc };
9980 RTUINT64U uDst;
9981 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9982 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9983 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9984 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9985 *puDst = uDst.u;
9986}
9987
9988
9989IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9990{
9991 RTUINT128U uSrc1 = *puDst;
9992 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9993 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9994 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9995 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9996 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9997 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9998 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9999 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
10000}
10001
10002#endif
10003
10004IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10005{
10006 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
10007 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
10008 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
10009 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
10010 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
10011 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
10012 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
10013 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
10014}
10015
10016
10017IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10018{
10019 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
10020 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
10021 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
10022 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
10023 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
10024 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
10025 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
10026 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
10027 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
10028 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
10029 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
10030 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
10031 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
10032 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
10033 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
10034 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
10035}
10036
10037
10038/*
10039 * PSRLW / VPSRLW
10040 */
10041#ifdef IEM_WITHOUT_ASSEMBLY
10042
10043IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10044{
10045 RTUINT64U uSrc1 = { *puDst };
10046 RTUINT64U uSrc2 = { *puSrc };
10047 RTUINT64U uDst;
10048
10049 if (uSrc2.au64[0] <= 15)
10050 {
10051 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
10052 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
10053 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
10054 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
10055 }
10056 else
10057 {
10058 uDst.au64[0] = 0;
10059 }
10060 *puDst = uDst.u;
10061}
10062
10063
10064IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10065{
10066 RTUINT64U uSrc1 = { *puDst };
10067 RTUINT64U uDst;
10068
10069 if (uShift <= 15)
10070 {
10071 uDst.au16[0] = uSrc1.au16[0] >> uShift;
10072 uDst.au16[1] = uSrc1.au16[1] >> uShift;
10073 uDst.au16[2] = uSrc1.au16[2] >> uShift;
10074 uDst.au16[3] = uSrc1.au16[3] >> uShift;
10075 }
10076 else
10077 {
10078 uDst.au64[0] = 0;
10079 }
10080 *puDst = uDst.u;
10081}
10082
10083
10084IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10085{
10086 RTUINT128U uSrc1 = *puDst;
10087
10088 if (puSrc->au64[0] <= 15)
10089 {
10090 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
10091 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
10092 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
10093 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
10094 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
10095 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
10096 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
10097 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
10098 }
10099 else
10100 {
10101 puDst->au64[0] = 0;
10102 puDst->au64[1] = 0;
10103 }
10104}
10105
10106IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10107{
10108 RTUINT128U uSrc1 = *puDst;
10109
10110 if (uShift <= 15)
10111 {
10112 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10113 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10114 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10115 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10116 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10117 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10118 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10119 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10120 }
10121 else
10122 {
10123 puDst->au64[0] = 0;
10124 puDst->au64[1] = 0;
10125 }
10126}
10127
10128#endif
10129
10130
10131/*
10132 * PSRAW / VPSRAW
10133 */
10134#ifdef IEM_WITHOUT_ASSEMBLY
10135
10136IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10137{
10138 RTUINT64U uSrc1 = { *puDst };
10139 RTUINT64U uSrc2 = { *puSrc };
10140 RTUINT64U uDst;
10141
10142 if (uSrc2.au64[0] <= 15)
10143 {
10144 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
10145 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
10146 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
10147 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
10148 }
10149 else
10150 {
10151 uDst.au64[0] = 0;
10152 }
10153 *puDst = uDst.u;
10154}
10155
10156
10157IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10158{
10159 RTUINT64U uSrc1 = { *puDst };
10160 RTUINT64U uDst;
10161
10162 if (uShift <= 15)
10163 {
10164 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10165 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10166 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10167 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10168 }
10169 else
10170 {
10171 uDst.au64[0] = 0;
10172 }
10173 *puDst = uDst.u;
10174}
10175
10176
10177IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10178{
10179 RTUINT128U uSrc1 = *puDst;
10180
10181 if (puSrc->au64[0] <= 15)
10182 {
10183 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
10184 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
10185 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
10186 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
10187 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
10188 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
10189 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
10190 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
10191 }
10192 else
10193 {
10194 puDst->au64[0] = 0;
10195 puDst->au64[1] = 0;
10196 }
10197}
10198
10199IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10200{
10201 RTUINT128U uSrc1 = *puDst;
10202
10203 if (uShift <= 15)
10204 {
10205 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10206 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10207 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10208 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10209 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10210 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10211 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10212 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10213 }
10214 else
10215 {
10216 puDst->au64[0] = 0;
10217 puDst->au64[1] = 0;
10218 }
10219}
10220
10221#endif
10222
10223
10224/*
10225 * PSLLW / VPSLLW
10226 */
10227#ifdef IEM_WITHOUT_ASSEMBLY
10228
10229IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10230{
10231 RTUINT64U uSrc1 = { *puDst };
10232 RTUINT64U uSrc2 = { *puSrc };
10233 RTUINT64U uDst;
10234
10235 if (uSrc2.au64[0] <= 15)
10236 {
10237 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
10238 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
10239 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
10240 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
10241 }
10242 else
10243 {
10244 uDst.au64[0] = 0;
10245 }
10246 *puDst = uDst.u;
10247}
10248
10249
10250IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10251{
10252 RTUINT64U uSrc1 = { *puDst };
10253 RTUINT64U uDst;
10254
10255 if (uShift <= 15)
10256 {
10257 uDst.au16[0] = uSrc1.au16[0] << uShift;
10258 uDst.au16[1] = uSrc1.au16[1] << uShift;
10259 uDst.au16[2] = uSrc1.au16[2] << uShift;
10260 uDst.au16[3] = uSrc1.au16[3] << uShift;
10261 }
10262 else
10263 {
10264 uDst.au64[0] = 0;
10265 }
10266 *puDst = uDst.u;
10267}
10268
10269
10270IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10271{
10272 RTUINT128U uSrc1 = *puDst;
10273
10274 if (puSrc->au64[0] <= 15)
10275 {
10276 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
10277 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
10278 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
10279 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
10280 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
10281 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
10282 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
10283 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
10284 }
10285 else
10286 {
10287 puDst->au64[0] = 0;
10288 puDst->au64[1] = 0;
10289 }
10290}
10291
10292IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10293{
10294 RTUINT128U uSrc1 = *puDst;
10295
10296 if (uShift <= 15)
10297 {
10298 puDst->au16[0] = uSrc1.au16[0] << uShift;
10299 puDst->au16[1] = uSrc1.au16[1] << uShift;
10300 puDst->au16[2] = uSrc1.au16[2] << uShift;
10301 puDst->au16[3] = uSrc1.au16[3] << uShift;
10302 puDst->au16[4] = uSrc1.au16[4] << uShift;
10303 puDst->au16[5] = uSrc1.au16[5] << uShift;
10304 puDst->au16[6] = uSrc1.au16[6] << uShift;
10305 puDst->au16[7] = uSrc1.au16[7] << uShift;
10306 }
10307 else
10308 {
10309 puDst->au64[0] = 0;
10310 puDst->au64[1] = 0;
10311 }
10312}
10313
10314#endif
10315
10316
10317/*
10318 * PSRLD / VPSRLD
10319 */
10320#ifdef IEM_WITHOUT_ASSEMBLY
10321
10322IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10323{
10324 RTUINT64U uSrc1 = { *puDst };
10325 RTUINT64U uSrc2 = { *puSrc };
10326 RTUINT64U uDst;
10327
10328 if (uSrc2.au64[0] <= 31)
10329 {
10330 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
10331 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
10332 }
10333 else
10334 {
10335 uDst.au64[0] = 0;
10336 }
10337 *puDst = uDst.u;
10338}
10339
10340
10341IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10342{
10343 RTUINT64U uSrc1 = { *puDst };
10344 RTUINT64U uDst;
10345
10346 if (uShift <= 31)
10347 {
10348 uDst.au32[0] = uSrc1.au32[0] >> uShift;
10349 uDst.au32[1] = uSrc1.au32[1] >> uShift;
10350 }
10351 else
10352 {
10353 uDst.au64[0] = 0;
10354 }
10355 *puDst = uDst.u;
10356}
10357
10358
10359IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10360{
10361 RTUINT128U uSrc1 = *puDst;
10362
10363 if (puSrc->au64[0] <= 31)
10364 {
10365 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
10366 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
10367 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
10368 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
10369 }
10370 else
10371 {
10372 puDst->au64[0] = 0;
10373 puDst->au64[1] = 0;
10374 }
10375}
10376
10377IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10378{
10379 RTUINT128U uSrc1 = *puDst;
10380
10381 if (uShift <= 31)
10382 {
10383 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10384 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10385 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10386 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10387 }
10388 else
10389 {
10390 puDst->au64[0] = 0;
10391 puDst->au64[1] = 0;
10392 }
10393}
10394
10395#endif
10396
10397
10398/*
10399 * PSRAD / VPSRAD
10400 */
10401#ifdef IEM_WITHOUT_ASSEMBLY
10402
10403IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
10404{
10405 RTUINT64U uSrc1 = { *puDst };
10406 RTUINT64U uSrc2 = { *puSrc };
10407 RTUINT64U uDst;
10408
10409 if (uSrc2.au64[0] <= 31)
10410 {
10411 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
10412 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
10413 }
10414 else
10415 {
10416 uDst.au64[0] = 0;
10417 }
10418 *puDst = uDst.u;
10419}
10420
10421
10422IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
10423{
10424 RTUINT64U uSrc1 = { *puDst };
10425 RTUINT64U uDst;
10426
10427 if (uShift <= 31)
10428 {
10429 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10430 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10431 }
10432 else
10433 {
10434 uDst.au64[0] = 0;
10435 }
10436 *puDst = uDst.u;
10437}
10438
10439
10440IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10441{
10442 RTUINT128U uSrc1 = *puDst;
10443
10444 if (puSrc->au64[0] <= 31)
10445 {
10446 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
10447 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
10448 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
10449 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
10450 }
10451 else
10452 {
10453 puDst->au64[0] = 0;
10454 puDst->au64[1] = 0;
10455 }
10456}
10457
10458IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10459{
10460 RTUINT128U uSrc1 = *puDst;
10461
10462 if (uShift <= 31)
10463 {
10464 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10465 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10466 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10467 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10468 }
10469 else
10470 {
10471 puDst->au64[0] = 0;
10472 puDst->au64[1] = 0;
10473 }
10474}
10475
10476#endif
10477
10478
10479/*
10480 * PSLLD / VPSLLD
10481 */
10482#ifdef IEM_WITHOUT_ASSEMBLY
10483
10484IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10485{
10486 RTUINT64U uSrc1 = { *puDst };
10487 RTUINT64U uSrc2 = { *puSrc };
10488 RTUINT64U uDst;
10489
10490 if (uSrc2.au64[0] <= 31)
10491 {
10492 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
10493 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
10494 }
10495 else
10496 {
10497 uDst.au64[0] = 0;
10498 }
10499 *puDst = uDst.u;
10500}
10501
10502
10503IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10504{
10505 RTUINT64U uSrc1 = { *puDst };
10506 RTUINT64U uDst;
10507
10508 if (uShift <= 31)
10509 {
10510 uDst.au32[0] = uSrc1.au32[0] << uShift;
10511 uDst.au32[1] = uSrc1.au32[1] << uShift;
10512 }
10513 else
10514 {
10515 uDst.au64[0] = 0;
10516 }
10517 *puDst = uDst.u;
10518}
10519
10520
10521IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10522{
10523 RTUINT128U uSrc1 = *puDst;
10524
10525 if (puSrc->au64[0] <= 31)
10526 {
10527 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10528 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10529 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10530 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10531 }
10532 else
10533 {
10534 puDst->au64[0] = 0;
10535 puDst->au64[1] = 0;
10536 }
10537}
10538
10539IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10540{
10541 RTUINT128U uSrc1 = *puDst;
10542
10543 if (uShift <= 31)
10544 {
10545 puDst->au32[0] = uSrc1.au32[0] << uShift;
10546 puDst->au32[1] = uSrc1.au32[1] << uShift;
10547 puDst->au32[2] = uSrc1.au32[2] << uShift;
10548 puDst->au32[3] = uSrc1.au32[3] << uShift;
10549 }
10550 else
10551 {
10552 puDst->au64[0] = 0;
10553 puDst->au64[1] = 0;
10554 }
10555}
10556
10557#endif
10558
10559
10560/*
10561 * PSRLQ / VPSRLQ
10562 */
10563#ifdef IEM_WITHOUT_ASSEMBLY
10564
10565IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10566{
10567 RTUINT64U uSrc1 = { *puDst };
10568 RTUINT64U uSrc2 = { *puSrc };
10569 RTUINT64U uDst;
10570
10571 if (uSrc2.au64[0] <= 63)
10572 {
10573 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10574 }
10575 else
10576 {
10577 uDst.au64[0] = 0;
10578 }
10579 *puDst = uDst.u;
10580}
10581
10582
10583IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10584{
10585 RTUINT64U uSrc1 = { *puDst };
10586 RTUINT64U uDst;
10587
10588 if (uShift <= 63)
10589 {
10590 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10591 }
10592 else
10593 {
10594 uDst.au64[0] = 0;
10595 }
10596 *puDst = uDst.u;
10597}
10598
10599
10600IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10601{
10602 RTUINT128U uSrc1 = *puDst;
10603
10604 if (puSrc->au64[0] <= 63)
10605 {
10606 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10607 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10608 }
10609 else
10610 {
10611 puDst->au64[0] = 0;
10612 puDst->au64[1] = 0;
10613 }
10614}
10615
10616IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10617{
10618 RTUINT128U uSrc1 = *puDst;
10619
10620 if (uShift <= 63)
10621 {
10622 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10623 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10624 }
10625 else
10626 {
10627 puDst->au64[0] = 0;
10628 puDst->au64[1] = 0;
10629 }
10630}
10631
10632#endif
10633
10634
10635/*
10636 * PSLLQ / VPSLLQ
10637 */
10638#ifdef IEM_WITHOUT_ASSEMBLY
10639
10640IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10641{
10642 RTUINT64U uSrc1 = { *puDst };
10643 RTUINT64U uSrc2 = { *puSrc };
10644 RTUINT64U uDst;
10645
10646 if (uSrc2.au64[0] <= 63)
10647 {
10648 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10649 }
10650 else
10651 {
10652 uDst.au64[0] = 0;
10653 }
10654 *puDst = uDst.u;
10655}
10656
10657
10658IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10659{
10660 RTUINT64U uSrc1 = { *puDst };
10661 RTUINT64U uDst;
10662
10663 if (uShift <= 63)
10664 {
10665 uDst.au64[0] = uSrc1.au64[0] << uShift;
10666 }
10667 else
10668 {
10669 uDst.au64[0] = 0;
10670 }
10671 *puDst = uDst.u;
10672}
10673
10674
10675IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10676{
10677 RTUINT128U uSrc1 = *puDst;
10678
10679 if (puSrc->au64[0] <= 63)
10680 {
10681 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10682 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10683 }
10684 else
10685 {
10686 puDst->au64[0] = 0;
10687 puDst->au64[1] = 0;
10688 }
10689}
10690
10691IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10692{
10693 RTUINT128U uSrc1 = *puDst;
10694
10695 if (uShift <= 63)
10696 {
10697 puDst->au64[0] = uSrc1.au64[0] << uShift;
10698 puDst->au64[1] = uSrc1.au64[1] << uShift;
10699 }
10700 else
10701 {
10702 puDst->au64[0] = 0;
10703 puDst->au64[1] = 0;
10704 }
10705}
10706
10707#endif
10708
10709
10710/*
10711 * PSRLDQ / VPSRLDQ
10712 */
10713#ifdef IEM_WITHOUT_ASSEMBLY
10714
10715IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10716{
10717 RTUINT128U uSrc1 = *puDst;
10718
10719 if (uShift < 16)
10720 {
10721 int i;
10722
10723 for (i = 0; i < 16 - uShift; ++i)
10724 puDst->au8[i] = uSrc1.au8[i + uShift];
10725 for (i = 16 - uShift; i < 16; ++i)
10726 puDst->au8[i] = 0;
10727 }
10728 else
10729 {
10730 puDst->au64[0] = 0;
10731 puDst->au64[1] = 0;
10732 }
10733}
10734
10735#endif
10736
10737
10738/*
10739 * PSLLDQ / VPSLLDQ
10740 */
10741#ifdef IEM_WITHOUT_ASSEMBLY
10742
10743IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10744{
10745 RTUINT128U uSrc1 = *puDst;
10746
10747 if (uShift < 16)
10748 {
10749 int i;
10750
10751 for (i = 0; i < uShift; ++i)
10752 puDst->au8[i] = 0;
10753 for (i = uShift; i < 16; ++i)
10754 puDst->au8[i] = uSrc1.au8[i - uShift];
10755 }
10756 else
10757 {
10758 puDst->au64[0] = 0;
10759 puDst->au64[1] = 0;
10760 }
10761}
10762
10763#endif
10764
10765
10766/*
10767 * PMADDWD / VPMADDWD
10768 */
10769#ifdef IEM_WITHOUT_ASSEMBLY
10770
10771IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10772{
10773 RTUINT64U uSrc1 = { *puDst };
10774 RTUINT64U uSrc2 = { *puSrc };
10775 RTUINT64U uDst;
10776
10777 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
10778 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
10779 *puDst = uDst.u;
10780 RT_NOREF(pFpuState);
10781}
10782
10783
10784IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10785{
10786 RTUINT128U uSrc1 = *puDst;
10787
10788 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
10789 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
10790 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
10791 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
10792 RT_NOREF(pFpuState);
10793}
10794
10795#endif
10796
10797
10798/*
10799 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
10800 */
10801#ifdef IEM_WITHOUT_ASSEMBLY
10802
10803IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10804{
10805 RTUINT64U uSrc1 = { *puDst };
10806 RTUINT64U uSrc2 = { *puSrc };
10807 RTUINT64U uDst;
10808
10809 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
10810 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
10811 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
10812 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
10813 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
10814 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
10815 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
10816 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
10817 *puDst = uDst.u;
10818 RT_NOREF(pFpuState);
10819}
10820
10821
10822IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10823{
10824 RTUINT128U uSrc1 = *puDst;
10825
10826 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
10827 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
10828 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
10829 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
10830 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
10831 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
10832 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
10833 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
10834 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
10835 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
10836 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
10837 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
10838 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
10839 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
10840 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
10841 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
10842 RT_NOREF(pFpuState);
10843}
10844
10845#endif
10846
10847
10848IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10849{
10850 RTUINT128U uSrc1 = *puDst;
10851
10852 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
10853 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
10854 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
10855 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
10856 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
10857 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
10858 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
10859 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
10860 RT_NOREF(pFpuState);
10861}
10862
10863
10864IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10865{
10866 RTUINT128U uSrc1 = *puDst;
10867
10868 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
10869 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
10870 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
10871 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
10872 RT_NOREF(pFpuState);
10873}
10874
10875
10876IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10877 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10878{
10879 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10880 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10881 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10882 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10883 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10884 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10885 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10886 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10887 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10888 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10889 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10890 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10891 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10892 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10893 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10894 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10895 RT_NOREF(pExtState);
10896}
10897
10898
10899IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10900 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10901{
10902 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10903 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10904 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10905 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10906 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10907 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10908 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10909 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10910 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10911 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10912 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10913 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10914 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10915 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10916 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10917 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10918 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
10919 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
10920 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
10921 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
10922 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
10923 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
10924 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
10925 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
10926 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
10927 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
10928 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
10929 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
10930 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
10931 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
10932 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
10933 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
10934 RT_NOREF(pExtState);
10935}
10936
10937
10938IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10939 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10940{
10941 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10942 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10943 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10944 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10945 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10946 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10947 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10948 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10949 RT_NOREF(pExtState);
10950}
10951
10952
10953IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10954 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10955{
10956 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10957 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10958 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10959 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10960 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10961 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10962 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10963 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10964 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10965 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10966 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
10967 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
10968 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
10969 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
10970 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
10971 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
10972 RT_NOREF(pExtState);
10973}
10974
10975
10976IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10977 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10978{
10979 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10980 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10981 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10982 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10983 RT_NOREF(pExtState);
10984}
10985
10986
10987IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10988 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10989{
10990 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10991 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10992 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10993 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10994 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10995 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10996 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10997 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10998 RT_NOREF(pExtState);
10999}
11000
11001
11002/*
11003 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
11004 */
11005#ifdef IEM_WITHOUT_ASSEMBLY
11006
11007IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11008{
11009 RTUINT64U uSrc1 = { *puDst };
11010 RTUINT64U uSrc2 = { *puSrc };
11011 RTUINT64U uDst;
11012
11013 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
11014 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
11015 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
11016 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
11017 *puDst = uDst.u;
11018 RT_NOREF(pFpuState);
11019}
11020
11021
11022IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11023{
11024 RTUINT128U uSrc1 = *puDst;
11025
11026 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11027 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11028 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11029 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11030 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11031 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11032 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11033 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11034 RT_NOREF(pFpuState);
11035}
11036
11037#endif
11038
11039IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11040{
11041 RTUINT128U uSrc1 = *puDst;
11042
11043 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11044 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11045 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11046 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11047 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11048 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11049 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11050 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11051 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11052 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11053 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
11054 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
11055 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
11056 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
11057 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
11058 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
11059 RT_NOREF(pFpuState);
11060}
11061
11062
11063IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11064{
11065 RTUINT128U uSrc1 = *puDst;
11066
11067 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11068 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11069 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11070 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11071 RT_NOREF(pFpuState);
11072}
11073
11074
11075IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11076 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11077{
11078 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11079 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11080 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11081 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11082 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11083 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11084 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11085 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11086 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11087 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11088 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11089 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11090 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11091 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11092 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11093 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11094 RT_NOREF(pExtState);
11095}
11096
11097
11098IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11099 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11100{
11101 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11102 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11103 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11104 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11105 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11106 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11107 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11108 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11109 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11110 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11111 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11112 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11113 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11114 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11115 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11116 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11117 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
11118 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
11119 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
11120 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
11121 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
11122 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
11123 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
11124 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
11125 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
11126 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
11127 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
11128 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
11129 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
11130 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
11131 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
11132 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
11133 RT_NOREF(pExtState);
11134}
11135
11136
11137IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11138 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11139{
11140 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11141 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11142 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11143 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11144 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11145 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11146 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11147 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11148 RT_NOREF(pExtState);
11149}
11150
11151
11152IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11153 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11154{
11155 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11156 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11157 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11158 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11159 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11160 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11161 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11162 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11163 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11164 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11165 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
11166 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
11167 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
11168 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
11169 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
11170 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
11171 RT_NOREF(pExtState);
11172}
11173
11174
11175IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11176 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11177{
11178 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11179 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11180 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11181 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11182 RT_NOREF(pExtState);
11183}
11184
11185
11186IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11187 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11188{
11189 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11190 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11191 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11192 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11193 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11194 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11195 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11196 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11197 RT_NOREF(pExtState);
11198}
11199
11200
11201/*
11202 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
11203 */
11204#ifdef IEM_WITHOUT_ASSEMBLY
11205
11206IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11207{
11208 RTUINT64U uSrc1 = { *puDst };
11209 RTUINT64U uSrc2 = { *puSrc };
11210 RTUINT64U uDst;
11211
11212 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
11213 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
11214 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
11215 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
11216 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
11217 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
11218 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
11219 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
11220 *puDst = uDst.u;
11221 RT_NOREF(pFpuState);
11222}
11223
11224
11225IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11226{
11227 RTUINT128U uSrc1 = *puDst;
11228
11229 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
11230 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
11231 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
11232 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
11233 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
11234 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
11235 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
11236 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
11237 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
11238 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
11239 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
11240 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
11241 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
11242 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
11243 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
11244 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
11245 RT_NOREF(pFpuState);
11246}
11247
11248#endif
11249
11250IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11251{
11252 RTUINT128U uSrc1 = *puDst;
11253
11254 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
11255 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
11256 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
11257 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
11258 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
11259 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
11260 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
11261 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
11262 RT_NOREF(pFpuState);
11263}
11264
11265
11266IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11267{
11268 RTUINT128U uSrc1 = *puDst;
11269
11270 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
11271 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
11272 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
11273 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
11274 RT_NOREF(pFpuState);
11275}
11276
11277
11278IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11279 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11280{
11281 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11282 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11283 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11284 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11285 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11286 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11287 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11288 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11289 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11290 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11291 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11292 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11293 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11294 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11295 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11296 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11297 RT_NOREF(pExtState);
11298}
11299
11300
11301IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11302 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11303{
11304 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11305 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11306 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11307 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11308 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11309 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11310 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11311 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11312 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11313 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11314 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11315 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11316 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11317 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11318 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11319 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11320 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
11321 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
11322 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
11323 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
11324 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
11325 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
11326 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
11327 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
11328 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
11329 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
11330 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
11331 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
11332 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
11333 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
11334 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
11335 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
11336 RT_NOREF(pExtState);
11337}
11338
11339
11340IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11341 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11342{
11343 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11344 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11345 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11346 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11347 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11348 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11349 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11350 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11351 RT_NOREF(pExtState);
11352}
11353
11354
11355IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11356 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11357{
11358 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11359 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11360 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11361 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11362 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11363 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11364 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11365 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11366 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11367 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11368 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
11369 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
11370 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
11371 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
11372 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
11373 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
11374 RT_NOREF(pExtState);
11375}
11376
11377
11378IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11379 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11380{
11381 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11382 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11383 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11384 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11385 RT_NOREF(pExtState);
11386}
11387
11388
11389IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11390 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11391{
11392 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11393 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11394 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11395 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11396 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11397 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11398 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11399 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11400 RT_NOREF(pExtState);
11401}
11402
11403
11404/*
11405 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
11406 */
11407#ifdef IEM_WITHOUT_ASSEMBLY
11408
11409IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11410{
11411 RTUINT64U uSrc1 = { *puDst };
11412 RTUINT64U uSrc2 = { *puSrc };
11413 RTUINT64U uDst;
11414
11415 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
11416 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
11417 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
11418 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
11419 *puDst = uDst.u;
11420 RT_NOREF(pFpuState);
11421}
11422
11423
11424IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11425{
11426 RTUINT128U uSrc1 = *puDst;
11427
11428 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11429 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11430 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11431 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11432 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11433 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11434 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11435 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11436 RT_NOREF(pFpuState);
11437}
11438
11439#endif
11440
11441IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11442{
11443 RTUINT128U uSrc1 = *puDst;
11444
11445 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11446 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11447 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11448 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11449 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11450 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11451 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11452 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11453 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11454 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11455 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
11456 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
11457 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
11458 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
11459 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
11460 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
11461 RT_NOREF(pFpuState);
11462}
11463
11464
11465IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11466{
11467 RTUINT128U uSrc1 = *puDst;
11468
11469 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11470 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11471 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11472 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11473 RT_NOREF(pFpuState);
11474}
11475
11476
11477IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11478 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11479{
11480 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11481 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11482 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11483 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11484 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11485 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11486 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11487 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11488 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11489 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11490 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11491 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11492 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11493 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11494 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11495 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11496 RT_NOREF(pExtState);
11497}
11498
11499
11500IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11501 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11502{
11503 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11504 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11505 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11506 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11507 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11508 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11509 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11510 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11511 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11512 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11513 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11514 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11515 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11516 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11517 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11518 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11519 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
11520 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
11521 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
11522 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
11523 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
11524 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
11525 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
11526 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
11527 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
11528 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
11529 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
11530 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
11531 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
11532 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
11533 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
11534 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
11535 RT_NOREF(pExtState);
11536}
11537
11538
11539IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11540 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11541{
11542 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11543 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11544 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11545 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11546 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11547 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11548 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11549 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11550 RT_NOREF(pExtState);
11551}
11552
11553
11554IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11555 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11556{
11557 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11558 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11559 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11560 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11561 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11562 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11563 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11564 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11565 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11566 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11567 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
11568 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
11569 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
11570 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
11571 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
11572 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
11573 RT_NOREF(pExtState);
11574}
11575
11576
11577IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11578 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11579{
11580 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11581 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11582 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11583 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11584 RT_NOREF(pExtState);
11585}
11586
11587
11588IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11589 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11590{
11591 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11592 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11593 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11594 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11595 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11596 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11597 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11598 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11599 RT_NOREF(pExtState);
11600}
11601
11602
/*
 * PAVGB / VPAVGB / PAVGW / VPAVGW
 */
/** Rounded average of two bytes: (a + b + 1) >> 1.  The first operand is
 *  widened to 16 bits before adding so the carry out of bit 7 is not lost;
 *  the result is truncated back to 8 bits. */
#define PAVGB_EXEC(a_Src1, a_Src2) ((uint8_t)(((uint16_t)(a_Src1) + (a_Src2) + 1) >> 1))
/** Rounded average of two words: (a + b + 1) >> 1.  The first operand is
 *  widened to 32 bits before adding so the carry out of bit 15 is not lost;
 *  the result is truncated back to 16 bits. */
#define PAVGW_EXEC(a_Src1, a_Src2) ((uint16_t)(((uint32_t)(a_Src1) + (a_Src2) + 1) >> 1))
11608
11609#ifdef IEM_WITHOUT_ASSEMBLY
11610
11611IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11612{
11613 RTUINT64U uSrc1 = { *puDst };
11614 RTUINT64U uSrc2 = { *puSrc };
11615 RTUINT64U uDst;
11616
11617 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
11618 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
11619 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
11620 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
11621 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
11622 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
11623 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
11624 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
11625 *puDst = uDst.u;
11626}
11627
11628
11629IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11630{
11631 RTUINT128U uSrc1 = *puDst;
11632
11633 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11634 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11635 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11636 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11637 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11638 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11639 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11640 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11641 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11642 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11643 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11644 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11645 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11646 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11647 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11648 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11649}
11650
11651
11652IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11653{
11654 RTUINT64U uSrc1 = { *puDst };
11655 RTUINT64U uSrc2 = { *puSrc };
11656 RTUINT64U uDst;
11657
11658 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
11659 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
11660 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
11661 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
11662 *puDst = uDst.u;
11663}
11664
11665
11666IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11667{
11668 RTUINT128U uSrc1 = *puDst;
11669
11670 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
11671 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
11672 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
11673 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
11674 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
11675 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
11676 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
11677 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
11678}
11679
11680#endif
11681
11682IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11683{
11684 RTUINT128U uSrc1 = *puDst;
11685
11686 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11687 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11688 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11689 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11690 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11691 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11692 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11693 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11694 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11695 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11696 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11697 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11698 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11699 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11700 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11701 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11702}
11703
11704
11705IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11706{
11707 RTUINT128U uSrc1 = *puDst;
11708
11709 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11710 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11711 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11712 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11713 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11714 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11715 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11716 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11717 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11718 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11719 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11720 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11721 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11722 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11723 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11724 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11725}
11726
11727
11728IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11729{
11730 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11731 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11732 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11733 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11734 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11735 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11736 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11737 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11738 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11739 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11740 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11741 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11742 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11743 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11744 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11745 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11746}
11747
11748
11749IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11750{
11751 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11752 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11753 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11754 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11755 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11756 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11757 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11758 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11759 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11760 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11761 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11762 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11763 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11764 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11765 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11766 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11767 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
11768 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
11769 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
11770 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
11771 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
11772 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
11773 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
11774 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
11775 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
11776 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
11777 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
11778 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
11779 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
11780 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
11781 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
11782 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
11783}
11784
11785
11786IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11787{
11788 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11789 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11790 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11791 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11792 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11793 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11794 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11795 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11796}
11797
11798
11799IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11800{
11801 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11802 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11803 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11804 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11805 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11806 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11807 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11808 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11809 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11810 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11811 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
11812 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
11813 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
11814 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
11815 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
11816 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
11817}
11818
11819#undef PAVGB_EXEC
11820#undef PAVGW_EXEC
11821
11822
11823/*
11824 * PMOVMSKB / VPMOVMSKB
11825 */
11826#ifdef IEM_WITHOUT_ASSEMBLY
11827
11828IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
11829{
11830 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11831 uint64_t const uSrc = *pu64Src;
11832 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
11833 | ((uSrc >> (15-1)) & RT_BIT_64(1))
11834 | ((uSrc >> (23-2)) & RT_BIT_64(2))
11835 | ((uSrc >> (31-3)) & RT_BIT_64(3))
11836 | ((uSrc >> (39-4)) & RT_BIT_64(4))
11837 | ((uSrc >> (47-5)) & RT_BIT_64(5))
11838 | ((uSrc >> (55-6)) & RT_BIT_64(6))
11839 | ((uSrc >> (63-7)) & RT_BIT_64(7));
11840}
11841
11842
11843IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
11844{
11845 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11846 uint64_t const uSrc0 = pu128Src->QWords.qw0;
11847 uint64_t const uSrc1 = pu128Src->QWords.qw1;
11848 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11849 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11850 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11851 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11852 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11853 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11854 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11855 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11856 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11857 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11858 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11859 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11860 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11861 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11862 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11863 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
11864}
11865
11866#endif
11867
11868IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
11869{
11870 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11871 uint64_t const uSrc0 = puSrc->QWords.qw0;
11872 uint64_t const uSrc1 = puSrc->QWords.qw1;
11873 uint64_t const uSrc2 = puSrc->QWords.qw2;
11874 uint64_t const uSrc3 = puSrc->QWords.qw3;
11875 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11876 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11877 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11878 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11879 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11880 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11881 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11882 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11883 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11884 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11885 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11886 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11887 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11888 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11889 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11890 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
11891 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
11892 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
11893 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
11894 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
11895 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
11896 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
11897 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
11898 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
11899 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
11900 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
11901 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
11902 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
11903 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
11904 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
11905 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
11906 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
11907}
11908
11909
11910/*
11911 * [V]PSHUFB
11912 */
11913
11914IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11915{
11916 RTUINT64U const uSrc = { *puSrc };
11917 RTUINT64U const uDstIn = { *puDst };
11918 ASMCompilerBarrier();
11919 RTUINT64U uDstOut = { 0 };
11920 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
11921 {
11922 uint8_t idxSrc = uSrc.au8[iByte];
11923 if (!(idxSrc & 0x80))
11924 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
11925 }
11926 *puDst = uDstOut.u;
11927 RT_NOREF(pFpuState);
11928}
11929
11930
11931IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11932{
11933 RTUINT128U const uSrc = *puSrc;
11934 RTUINT128U const uDstIn = *puDst;
11935 ASMCompilerBarrier();
11936 puDst->au64[0] = 0;
11937 puDst->au64[1] = 0;
11938 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11939 {
11940 uint8_t idxSrc = uSrc.au8[iByte];
11941 if (!(idxSrc & 0x80))
11942 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
11943 }
11944 RT_NOREF(pFpuState);
11945}
11946
11947
11948IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11949 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11950{
11951 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
11952 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
11953 ASMCompilerBarrier();
11954 puDst->au64[0] = 0;
11955 puDst->au64[1] = 0;
11956 for (unsigned iByte = 0; iByte < 16; iByte++)
11957 {
11958 uint8_t idxSrc = uSrc2.au8[iByte];
11959 if (!(idxSrc & 0x80))
11960 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11961 }
11962 RT_NOREF(pExtState);
11963}
11964
11965
11966IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11967 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11968{
11969 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
11970 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
11971 ASMCompilerBarrier();
11972 puDst->au64[0] = 0;
11973 puDst->au64[1] = 0;
11974 puDst->au64[2] = 0;
11975 puDst->au64[3] = 0;
11976 for (unsigned iByte = 0; iByte < 16; iByte++)
11977 {
11978 uint8_t idxSrc = uSrc2.au8[iByte];
11979 if (!(idxSrc & 0x80))
11980 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11981 }
11982 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11983 {
11984 uint8_t idxSrc = uSrc2.au8[iByte];
11985 if (!(idxSrc & 0x80))
11986 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
11987 }
11988 RT_NOREF(pExtState);
11989}
11990
11991
11992/*
11993 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
11994 */
11995#ifdef IEM_WITHOUT_ASSEMBLY
11996
11997IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
11998{
11999 uint64_t const uSrc = *puSrc;
12000 ASMCompilerBarrier();
12001 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12002 uSrc >> (((bEvil >> 2) & 3) * 16),
12003 uSrc >> (((bEvil >> 4) & 3) * 16),
12004 uSrc >> (((bEvil >> 6) & 3) * 16));
12005}
12006
12007
12008IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12009{
12010 puDst->QWords.qw0 = puSrc->QWords.qw0;
12011 uint64_t const uSrc = puSrc->QWords.qw1;
12012 ASMCompilerBarrier();
12013 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12014 uSrc >> (((bEvil >> 2) & 3) * 16),
12015 uSrc >> (((bEvil >> 4) & 3) * 16),
12016 uSrc >> (((bEvil >> 6) & 3) * 16));
12017}
12018
12019#endif
12020
12021IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12022{
12023 puDst->QWords.qw0 = puSrc->QWords.qw0;
12024 uint64_t const uSrc1 = puSrc->QWords.qw1;
12025 puDst->QWords.qw2 = puSrc->QWords.qw2;
12026 uint64_t const uSrc3 = puSrc->QWords.qw3;
12027 ASMCompilerBarrier();
12028 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
12029 uSrc1 >> (((bEvil >> 2) & 3) * 16),
12030 uSrc1 >> (((bEvil >> 4) & 3) * 16),
12031 uSrc1 >> (((bEvil >> 6) & 3) * 16));
12032 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
12033 uSrc3 >> (((bEvil >> 2) & 3) * 16),
12034 uSrc3 >> (((bEvil >> 4) & 3) * 16),
12035 uSrc3 >> (((bEvil >> 6) & 3) * 16));
12036}
12037
12038#ifdef IEM_WITHOUT_ASSEMBLY
12039IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12040{
12041 puDst->QWords.qw1 = puSrc->QWords.qw1;
12042 uint64_t const uSrc = puSrc->QWords.qw0;
12043 ASMCompilerBarrier();
12044 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12045 uSrc >> (((bEvil >> 2) & 3) * 16),
12046 uSrc >> (((bEvil >> 4) & 3) * 16),
12047 uSrc >> (((bEvil >> 6) & 3) * 16));
12048
12049}
12050#endif
12051
12052
12053IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12054{
12055 puDst->QWords.qw3 = puSrc->QWords.qw3;
12056 uint64_t const uSrc2 = puSrc->QWords.qw2;
12057 puDst->QWords.qw1 = puSrc->QWords.qw1;
12058 uint64_t const uSrc0 = puSrc->QWords.qw0;
12059 ASMCompilerBarrier();
12060 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
12061 uSrc0 >> (((bEvil >> 2) & 3) * 16),
12062 uSrc0 >> (((bEvil >> 4) & 3) * 16),
12063 uSrc0 >> (((bEvil >> 6) & 3) * 16));
12064 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
12065 uSrc2 >> (((bEvil >> 2) & 3) * 16),
12066 uSrc2 >> (((bEvil >> 4) & 3) * 16),
12067 uSrc2 >> (((bEvil >> 6) & 3) * 16));
12068
12069}
12070
12071
12072#ifdef IEM_WITHOUT_ASSEMBLY
12073IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12074{
12075 RTUINT128U const uSrc = *puSrc;
12076 ASMCompilerBarrier();
12077 puDst->au32[0] = uSrc.au32[bEvil & 3];
12078 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
12079 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
12080 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
12081}
12082#endif
12083
12084
12085IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12086{
12087 RTUINT256U const uSrc = *puSrc;
12088 ASMCompilerBarrier();
12089 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
12090 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
12091 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
12092 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
12093 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
12094 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
12095 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
12096 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
12097}
12098
12099
12100/*
12101 * PUNPCKHBW - high bytes -> words
12102 */
12103#ifdef IEM_WITHOUT_ASSEMBLY
12104
12105IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12106{
12107 RTUINT64U const uSrc2 = { *puSrc };
12108 RTUINT64U const uSrc1 = { *puDst };
12109 ASMCompilerBarrier();
12110 RTUINT64U uDstOut;
12111 uDstOut.au8[0] = uSrc1.au8[4];
12112 uDstOut.au8[1] = uSrc2.au8[4];
12113 uDstOut.au8[2] = uSrc1.au8[5];
12114 uDstOut.au8[3] = uSrc2.au8[5];
12115 uDstOut.au8[4] = uSrc1.au8[6];
12116 uDstOut.au8[5] = uSrc2.au8[6];
12117 uDstOut.au8[6] = uSrc1.au8[7];
12118 uDstOut.au8[7] = uSrc2.au8[7];
12119 *puDst = uDstOut.u;
12120}
12121
12122
12123IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12124{
12125 RTUINT128U const uSrc2 = *puSrc;
12126 RTUINT128U const uSrc1 = *puDst;
12127 ASMCompilerBarrier();
12128 RTUINT128U uDstOut;
12129 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12130 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12131 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12132 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12133 uDstOut.au8[ 4] = uSrc1.au8[10];
12134 uDstOut.au8[ 5] = uSrc2.au8[10];
12135 uDstOut.au8[ 6] = uSrc1.au8[11];
12136 uDstOut.au8[ 7] = uSrc2.au8[11];
12137 uDstOut.au8[ 8] = uSrc1.au8[12];
12138 uDstOut.au8[ 9] = uSrc2.au8[12];
12139 uDstOut.au8[10] = uSrc1.au8[13];
12140 uDstOut.au8[11] = uSrc2.au8[13];
12141 uDstOut.au8[12] = uSrc1.au8[14];
12142 uDstOut.au8[13] = uSrc2.au8[14];
12143 uDstOut.au8[14] = uSrc1.au8[15];
12144 uDstOut.au8[15] = uSrc2.au8[15];
12145 *puDst = uDstOut;
12146}
12147
12148#endif
12149
12150IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12151{
12152 RTUINT128U const uSrc2 = *puSrc2;
12153 RTUINT128U const uSrc1 = *puSrc1;
12154 ASMCompilerBarrier();
12155 RTUINT128U uDstOut;
12156 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12157 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12158 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12159 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12160 uDstOut.au8[ 4] = uSrc1.au8[10];
12161 uDstOut.au8[ 5] = uSrc2.au8[10];
12162 uDstOut.au8[ 6] = uSrc1.au8[11];
12163 uDstOut.au8[ 7] = uSrc2.au8[11];
12164 uDstOut.au8[ 8] = uSrc1.au8[12];
12165 uDstOut.au8[ 9] = uSrc2.au8[12];
12166 uDstOut.au8[10] = uSrc1.au8[13];
12167 uDstOut.au8[11] = uSrc2.au8[13];
12168 uDstOut.au8[12] = uSrc1.au8[14];
12169 uDstOut.au8[13] = uSrc2.au8[14];
12170 uDstOut.au8[14] = uSrc1.au8[15];
12171 uDstOut.au8[15] = uSrc2.au8[15];
12172 *puDst = uDstOut;
12173}
12174
12175
12176IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12177{
12178 RTUINT256U const uSrc2 = *puSrc2;
12179 RTUINT256U const uSrc1 = *puSrc1;
12180 ASMCompilerBarrier();
12181 RTUINT256U uDstOut;
12182 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12183 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12184 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12185 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12186 uDstOut.au8[ 4] = uSrc1.au8[10];
12187 uDstOut.au8[ 5] = uSrc2.au8[10];
12188 uDstOut.au8[ 6] = uSrc1.au8[11];
12189 uDstOut.au8[ 7] = uSrc2.au8[11];
12190 uDstOut.au8[ 8] = uSrc1.au8[12];
12191 uDstOut.au8[ 9] = uSrc2.au8[12];
12192 uDstOut.au8[10] = uSrc1.au8[13];
12193 uDstOut.au8[11] = uSrc2.au8[13];
12194 uDstOut.au8[12] = uSrc1.au8[14];
12195 uDstOut.au8[13] = uSrc2.au8[14];
12196 uDstOut.au8[14] = uSrc1.au8[15];
12197 uDstOut.au8[15] = uSrc2.au8[15];
12198 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12199 uDstOut.au8[16] = uSrc1.au8[24];
12200 uDstOut.au8[17] = uSrc2.au8[24];
12201 uDstOut.au8[18] = uSrc1.au8[25];
12202 uDstOut.au8[19] = uSrc2.au8[25];
12203 uDstOut.au8[20] = uSrc1.au8[26];
12204 uDstOut.au8[21] = uSrc2.au8[26];
12205 uDstOut.au8[22] = uSrc1.au8[27];
12206 uDstOut.au8[23] = uSrc2.au8[27];
12207 uDstOut.au8[24] = uSrc1.au8[28];
12208 uDstOut.au8[25] = uSrc2.au8[28];
12209 uDstOut.au8[26] = uSrc1.au8[29];
12210 uDstOut.au8[27] = uSrc2.au8[29];
12211 uDstOut.au8[28] = uSrc1.au8[30];
12212 uDstOut.au8[29] = uSrc2.au8[30];
12213 uDstOut.au8[30] = uSrc1.au8[31];
12214 uDstOut.au8[31] = uSrc2.au8[31];
12215 *puDst = uDstOut;
12216}
12217
12218
12219/*
 * PUNPCKHWD - high words -> dwords
12221 */
12222#ifdef IEM_WITHOUT_ASSEMBLY
12223
12224IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12225{
12226 RTUINT64U const uSrc2 = { *puSrc };
12227 RTUINT64U const uSrc1 = { *puDst };
12228 ASMCompilerBarrier();
12229 RTUINT64U uDstOut;
12230 uDstOut.au16[0] = uSrc1.au16[2];
12231 uDstOut.au16[1] = uSrc2.au16[2];
12232 uDstOut.au16[2] = uSrc1.au16[3];
12233 uDstOut.au16[3] = uSrc2.au16[3];
12234 *puDst = uDstOut.u;
12235}
12236
12237
12238IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12239{
12240 RTUINT128U const uSrc2 = *puSrc;
12241 RTUINT128U const uSrc1 = *puDst;
12242 ASMCompilerBarrier();
12243 RTUINT128U uDstOut;
12244 uDstOut.au16[0] = uSrc1.au16[4];
12245 uDstOut.au16[1] = uSrc2.au16[4];
12246 uDstOut.au16[2] = uSrc1.au16[5];
12247 uDstOut.au16[3] = uSrc2.au16[5];
12248 uDstOut.au16[4] = uSrc1.au16[6];
12249 uDstOut.au16[5] = uSrc2.au16[6];
12250 uDstOut.au16[6] = uSrc1.au16[7];
12251 uDstOut.au16[7] = uSrc2.au16[7];
12252 *puDst = uDstOut;
12253}
12254
12255#endif
12256
12257IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12258{
12259 RTUINT128U const uSrc2 = *puSrc2;
12260 RTUINT128U const uSrc1 = *puSrc1;
12261 ASMCompilerBarrier();
12262 RTUINT128U uDstOut;
12263 uDstOut.au16[0] = uSrc1.au16[4];
12264 uDstOut.au16[1] = uSrc2.au16[4];
12265 uDstOut.au16[2] = uSrc1.au16[5];
12266 uDstOut.au16[3] = uSrc2.au16[5];
12267 uDstOut.au16[4] = uSrc1.au16[6];
12268 uDstOut.au16[5] = uSrc2.au16[6];
12269 uDstOut.au16[6] = uSrc1.au16[7];
12270 uDstOut.au16[7] = uSrc2.au16[7];
12271 *puDst = uDstOut;
12272}
12273
12274
12275IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12276{
12277 RTUINT256U const uSrc2 = *puSrc2;
12278 RTUINT256U const uSrc1 = *puSrc1;
12279 ASMCompilerBarrier();
12280 RTUINT256U uDstOut;
12281 uDstOut.au16[0] = uSrc1.au16[4];
12282 uDstOut.au16[1] = uSrc2.au16[4];
12283 uDstOut.au16[2] = uSrc1.au16[5];
12284 uDstOut.au16[3] = uSrc2.au16[5];
12285 uDstOut.au16[4] = uSrc1.au16[6];
12286 uDstOut.au16[5] = uSrc2.au16[6];
12287 uDstOut.au16[6] = uSrc1.au16[7];
12288 uDstOut.au16[7] = uSrc2.au16[7];
12289
12290 uDstOut.au16[8] = uSrc1.au16[12];
12291 uDstOut.au16[9] = uSrc2.au16[12];
12292 uDstOut.au16[10] = uSrc1.au16[13];
12293 uDstOut.au16[11] = uSrc2.au16[13];
12294 uDstOut.au16[12] = uSrc1.au16[14];
12295 uDstOut.au16[13] = uSrc2.au16[14];
12296 uDstOut.au16[14] = uSrc1.au16[15];
12297 uDstOut.au16[15] = uSrc2.au16[15];
12298 *puDst = uDstOut;
12299}
12300
12301
/*
 * PUNPCKHDQ - high dwords -> qword(s)
 */
12305#ifdef IEM_WITHOUT_ASSEMBLY
12306
12307IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12308{
12309 RTUINT64U const uSrc2 = { *puSrc };
12310 RTUINT64U const uSrc1 = { *puDst };
12311 ASMCompilerBarrier();
12312 RTUINT64U uDstOut;
12313 uDstOut.au32[0] = uSrc1.au32[1];
12314 uDstOut.au32[1] = uSrc2.au32[1];
12315 *puDst = uDstOut.u;
12316}
12317
12318
12319IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12320{
12321 RTUINT128U const uSrc2 = *puSrc;
12322 RTUINT128U const uSrc1 = *puDst;
12323 ASMCompilerBarrier();
12324 RTUINT128U uDstOut;
12325 uDstOut.au32[0] = uSrc1.au32[2];
12326 uDstOut.au32[1] = uSrc2.au32[2];
12327 uDstOut.au32[2] = uSrc1.au32[3];
12328 uDstOut.au32[3] = uSrc2.au32[3];
12329 *puDst = uDstOut;
12330}
12331
12332#endif
12333
12334IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12335{
12336 RTUINT128U const uSrc2 = *puSrc2;
12337 RTUINT128U const uSrc1 = *puSrc1;
12338 ASMCompilerBarrier();
12339 RTUINT128U uDstOut;
12340 uDstOut.au32[0] = uSrc1.au32[2];
12341 uDstOut.au32[1] = uSrc2.au32[2];
12342 uDstOut.au32[2] = uSrc1.au32[3];
12343 uDstOut.au32[3] = uSrc2.au32[3];
12344 *puDst = uDstOut;
12345}
12346
12347
12348IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12349{
12350 RTUINT256U const uSrc2 = *puSrc2;
12351 RTUINT256U const uSrc1 = *puSrc1;
12352 ASMCompilerBarrier();
12353 RTUINT256U uDstOut;
12354 uDstOut.au32[0] = uSrc1.au32[2];
12355 uDstOut.au32[1] = uSrc2.au32[2];
12356 uDstOut.au32[2] = uSrc1.au32[3];
12357 uDstOut.au32[3] = uSrc2.au32[3];
12358
12359 uDstOut.au32[4] = uSrc1.au32[6];
12360 uDstOut.au32[5] = uSrc2.au32[6];
12361 uDstOut.au32[6] = uSrc1.au32[7];
12362 uDstOut.au32[7] = uSrc2.au32[7];
12363 *puDst = uDstOut;
12364}
12365
12366
12367/*
12368 * PUNPCKHQDQ -> High qwords -> double qword(s).
12369 */
12370#ifdef IEM_WITHOUT_ASSEMBLY
12371IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12372{
12373 RTUINT128U const uSrc2 = *puSrc;
12374 RTUINT128U const uSrc1 = *puDst;
12375 ASMCompilerBarrier();
12376 RTUINT128U uDstOut;
12377 uDstOut.au64[0] = uSrc1.au64[1];
12378 uDstOut.au64[1] = uSrc2.au64[1];
12379 *puDst = uDstOut;
12380}
12381#endif
12382
12383
12384IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12385{
12386 RTUINT128U const uSrc2 = *puSrc2;
12387 RTUINT128U const uSrc1 = *puSrc1;
12388 ASMCompilerBarrier();
12389 RTUINT128U uDstOut;
12390 uDstOut.au64[0] = uSrc1.au64[1];
12391 uDstOut.au64[1] = uSrc2.au64[1];
12392 *puDst = uDstOut;
12393}
12394
12395
12396IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12397{
12398 RTUINT256U const uSrc2 = *puSrc2;
12399 RTUINT256U const uSrc1 = *puSrc1;
12400 ASMCompilerBarrier();
12401 RTUINT256U uDstOut;
12402 uDstOut.au64[0] = uSrc1.au64[1];
12403 uDstOut.au64[1] = uSrc2.au64[1];
12404
12405 uDstOut.au64[2] = uSrc1.au64[3];
12406 uDstOut.au64[3] = uSrc2.au64[3];
12407 *puDst = uDstOut;
12408}
12409
12410
12411/*
12412 * PUNPCKLBW - low bytes -> words
12413 */
12414#ifdef IEM_WITHOUT_ASSEMBLY
12415
12416IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12417{
12418 RTUINT64U const uSrc2 = { *puSrc };
12419 RTUINT64U const uSrc1 = { *puDst };
12420 ASMCompilerBarrier();
12421 RTUINT64U uDstOut;
12422 uDstOut.au8[0] = uSrc1.au8[0];
12423 uDstOut.au8[1] = uSrc2.au8[0];
12424 uDstOut.au8[2] = uSrc1.au8[1];
12425 uDstOut.au8[3] = uSrc2.au8[1];
12426 uDstOut.au8[4] = uSrc1.au8[2];
12427 uDstOut.au8[5] = uSrc2.au8[2];
12428 uDstOut.au8[6] = uSrc1.au8[3];
12429 uDstOut.au8[7] = uSrc2.au8[3];
12430 *puDst = uDstOut.u;
12431}
12432
12433
12434IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12435{
12436 RTUINT128U const uSrc2 = *puSrc;
12437 RTUINT128U const uSrc1 = *puDst;
12438 ASMCompilerBarrier();
12439 RTUINT128U uDstOut;
12440 uDstOut.au8[ 0] = uSrc1.au8[0];
12441 uDstOut.au8[ 1] = uSrc2.au8[0];
12442 uDstOut.au8[ 2] = uSrc1.au8[1];
12443 uDstOut.au8[ 3] = uSrc2.au8[1];
12444 uDstOut.au8[ 4] = uSrc1.au8[2];
12445 uDstOut.au8[ 5] = uSrc2.au8[2];
12446 uDstOut.au8[ 6] = uSrc1.au8[3];
12447 uDstOut.au8[ 7] = uSrc2.au8[3];
12448 uDstOut.au8[ 8] = uSrc1.au8[4];
12449 uDstOut.au8[ 9] = uSrc2.au8[4];
12450 uDstOut.au8[10] = uSrc1.au8[5];
12451 uDstOut.au8[11] = uSrc2.au8[5];
12452 uDstOut.au8[12] = uSrc1.au8[6];
12453 uDstOut.au8[13] = uSrc2.au8[6];
12454 uDstOut.au8[14] = uSrc1.au8[7];
12455 uDstOut.au8[15] = uSrc2.au8[7];
12456 *puDst = uDstOut;
12457}
12458
12459#endif
12460
12461IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12462{
12463 RTUINT128U const uSrc2 = *puSrc2;
12464 RTUINT128U const uSrc1 = *puSrc1;
12465 ASMCompilerBarrier();
12466 RTUINT128U uDstOut;
12467 uDstOut.au8[ 0] = uSrc1.au8[0];
12468 uDstOut.au8[ 1] = uSrc2.au8[0];
12469 uDstOut.au8[ 2] = uSrc1.au8[1];
12470 uDstOut.au8[ 3] = uSrc2.au8[1];
12471 uDstOut.au8[ 4] = uSrc1.au8[2];
12472 uDstOut.au8[ 5] = uSrc2.au8[2];
12473 uDstOut.au8[ 6] = uSrc1.au8[3];
12474 uDstOut.au8[ 7] = uSrc2.au8[3];
12475 uDstOut.au8[ 8] = uSrc1.au8[4];
12476 uDstOut.au8[ 9] = uSrc2.au8[4];
12477 uDstOut.au8[10] = uSrc1.au8[5];
12478 uDstOut.au8[11] = uSrc2.au8[5];
12479 uDstOut.au8[12] = uSrc1.au8[6];
12480 uDstOut.au8[13] = uSrc2.au8[6];
12481 uDstOut.au8[14] = uSrc1.au8[7];
12482 uDstOut.au8[15] = uSrc2.au8[7];
12483 *puDst = uDstOut;
12484}
12485
12486
12487IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12488{
12489 RTUINT256U const uSrc2 = *puSrc2;
12490 RTUINT256U const uSrc1 = *puSrc1;
12491 ASMCompilerBarrier();
12492 RTUINT256U uDstOut;
12493 uDstOut.au8[ 0] = uSrc1.au8[0];
12494 uDstOut.au8[ 1] = uSrc2.au8[0];
12495 uDstOut.au8[ 2] = uSrc1.au8[1];
12496 uDstOut.au8[ 3] = uSrc2.au8[1];
12497 uDstOut.au8[ 4] = uSrc1.au8[2];
12498 uDstOut.au8[ 5] = uSrc2.au8[2];
12499 uDstOut.au8[ 6] = uSrc1.au8[3];
12500 uDstOut.au8[ 7] = uSrc2.au8[3];
12501 uDstOut.au8[ 8] = uSrc1.au8[4];
12502 uDstOut.au8[ 9] = uSrc2.au8[4];
12503 uDstOut.au8[10] = uSrc1.au8[5];
12504 uDstOut.au8[11] = uSrc2.au8[5];
12505 uDstOut.au8[12] = uSrc1.au8[6];
12506 uDstOut.au8[13] = uSrc2.au8[6];
12507 uDstOut.au8[14] = uSrc1.au8[7];
12508 uDstOut.au8[15] = uSrc2.au8[7];
12509 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12510 uDstOut.au8[16] = uSrc1.au8[16];
12511 uDstOut.au8[17] = uSrc2.au8[16];
12512 uDstOut.au8[18] = uSrc1.au8[17];
12513 uDstOut.au8[19] = uSrc2.au8[17];
12514 uDstOut.au8[20] = uSrc1.au8[18];
12515 uDstOut.au8[21] = uSrc2.au8[18];
12516 uDstOut.au8[22] = uSrc1.au8[19];
12517 uDstOut.au8[23] = uSrc2.au8[19];
12518 uDstOut.au8[24] = uSrc1.au8[20];
12519 uDstOut.au8[25] = uSrc2.au8[20];
12520 uDstOut.au8[26] = uSrc1.au8[21];
12521 uDstOut.au8[27] = uSrc2.au8[21];
12522 uDstOut.au8[28] = uSrc1.au8[22];
12523 uDstOut.au8[29] = uSrc2.au8[22];
12524 uDstOut.au8[30] = uSrc1.au8[23];
12525 uDstOut.au8[31] = uSrc2.au8[23];
12526 *puDst = uDstOut;
12527}
12528
12529
/*
 * PUNPCKLWD - low words -> dwords
 */
12533#ifdef IEM_WITHOUT_ASSEMBLY
12534
12535IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12536{
12537 RTUINT64U const uSrc2 = { *puSrc };
12538 RTUINT64U const uSrc1 = { *puDst };
12539 ASMCompilerBarrier();
12540 RTUINT64U uDstOut;
12541 uDstOut.au16[0] = uSrc1.au16[0];
12542 uDstOut.au16[1] = uSrc2.au16[0];
12543 uDstOut.au16[2] = uSrc1.au16[1];
12544 uDstOut.au16[3] = uSrc2.au16[1];
12545 *puDst = uDstOut.u;
12546}
12547
12548
12549IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12550{
12551 RTUINT128U const uSrc2 = *puSrc;
12552 RTUINT128U const uSrc1 = *puDst;
12553 ASMCompilerBarrier();
12554 RTUINT128U uDstOut;
12555 uDstOut.au16[0] = uSrc1.au16[0];
12556 uDstOut.au16[1] = uSrc2.au16[0];
12557 uDstOut.au16[2] = uSrc1.au16[1];
12558 uDstOut.au16[3] = uSrc2.au16[1];
12559 uDstOut.au16[4] = uSrc1.au16[2];
12560 uDstOut.au16[5] = uSrc2.au16[2];
12561 uDstOut.au16[6] = uSrc1.au16[3];
12562 uDstOut.au16[7] = uSrc2.au16[3];
12563 *puDst = uDstOut;
12564}
12565
12566#endif
12567
12568IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12569{
12570 RTUINT128U const uSrc2 = *puSrc2;
12571 RTUINT128U const uSrc1 = *puSrc1;
12572 ASMCompilerBarrier();
12573 RTUINT128U uDstOut;
12574 uDstOut.au16[0] = uSrc1.au16[0];
12575 uDstOut.au16[1] = uSrc2.au16[0];
12576 uDstOut.au16[2] = uSrc1.au16[1];
12577 uDstOut.au16[3] = uSrc2.au16[1];
12578 uDstOut.au16[4] = uSrc1.au16[2];
12579 uDstOut.au16[5] = uSrc2.au16[2];
12580 uDstOut.au16[6] = uSrc1.au16[3];
12581 uDstOut.au16[7] = uSrc2.au16[3];
12582 *puDst = uDstOut;
12583}
12584
12585
12586IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12587{
12588 RTUINT256U const uSrc2 = *puSrc2;
12589 RTUINT256U const uSrc1 = *puSrc1;
12590 ASMCompilerBarrier();
12591 RTUINT256U uDstOut;
12592 uDstOut.au16[0] = uSrc1.au16[0];
12593 uDstOut.au16[1] = uSrc2.au16[0];
12594 uDstOut.au16[2] = uSrc1.au16[1];
12595 uDstOut.au16[3] = uSrc2.au16[1];
12596 uDstOut.au16[4] = uSrc1.au16[2];
12597 uDstOut.au16[5] = uSrc2.au16[2];
12598 uDstOut.au16[6] = uSrc1.au16[3];
12599 uDstOut.au16[7] = uSrc2.au16[3];
12600
12601 uDstOut.au16[8] = uSrc1.au16[8];
12602 uDstOut.au16[9] = uSrc2.au16[8];
12603 uDstOut.au16[10] = uSrc1.au16[9];
12604 uDstOut.au16[11] = uSrc2.au16[9];
12605 uDstOut.au16[12] = uSrc1.au16[10];
12606 uDstOut.au16[13] = uSrc2.au16[10];
12607 uDstOut.au16[14] = uSrc1.au16[11];
12608 uDstOut.au16[15] = uSrc2.au16[11];
12609 *puDst = uDstOut;
12610}
12611
12612
/*
 * PUNPCKLDQ - low dwords -> qword(s)
 */
12616#ifdef IEM_WITHOUT_ASSEMBLY
12617
12618IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12619{
12620 RTUINT64U const uSrc2 = { *puSrc };
12621 RTUINT64U const uSrc1 = { *puDst };
12622 ASMCompilerBarrier();
12623 RTUINT64U uDstOut;
12624 uDstOut.au32[0] = uSrc1.au32[0];
12625 uDstOut.au32[1] = uSrc2.au32[0];
12626 *puDst = uDstOut.u;
12627}
12628
12629
12630IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12631{
12632 RTUINT128U const uSrc2 = *puSrc;
12633 RTUINT128U const uSrc1 = *puDst;
12634 ASMCompilerBarrier();
12635 RTUINT128U uDstOut;
12636 uDstOut.au32[0] = uSrc1.au32[0];
12637 uDstOut.au32[1] = uSrc2.au32[0];
12638 uDstOut.au32[2] = uSrc1.au32[1];
12639 uDstOut.au32[3] = uSrc2.au32[1];
12640 *puDst = uDstOut;
12641}
12642
12643#endif
12644
12645IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12646{
12647 RTUINT128U const uSrc2 = *puSrc2;
12648 RTUINT128U const uSrc1 = *puSrc1;
12649 ASMCompilerBarrier();
12650 RTUINT128U uDstOut;
12651 uDstOut.au32[0] = uSrc1.au32[0];
12652 uDstOut.au32[1] = uSrc2.au32[0];
12653 uDstOut.au32[2] = uSrc1.au32[1];
12654 uDstOut.au32[3] = uSrc2.au32[1];
12655 *puDst = uDstOut;
12656}
12657
12658
12659IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12660{
12661 RTUINT256U const uSrc2 = *puSrc2;
12662 RTUINT256U const uSrc1 = *puSrc1;
12663 ASMCompilerBarrier();
12664 RTUINT256U uDstOut;
12665 uDstOut.au32[0] = uSrc1.au32[0];
12666 uDstOut.au32[1] = uSrc2.au32[0];
12667 uDstOut.au32[2] = uSrc1.au32[1];
12668 uDstOut.au32[3] = uSrc2.au32[1];
12669
12670 uDstOut.au32[4] = uSrc1.au32[4];
12671 uDstOut.au32[5] = uSrc2.au32[4];
12672 uDstOut.au32[6] = uSrc1.au32[5];
12673 uDstOut.au32[7] = uSrc2.au32[5];
12674 *puDst = uDstOut;
12675}
12676
12677
12678/*
12679 * PUNPCKLQDQ -> Low qwords -> double qword(s).
12680 */
12681#ifdef IEM_WITHOUT_ASSEMBLY
12682IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12683{
12684 RTUINT128U const uSrc2 = *puSrc;
12685 RTUINT128U const uSrc1 = *puDst;
12686 ASMCompilerBarrier();
12687 RTUINT128U uDstOut;
12688 uDstOut.au64[0] = uSrc1.au64[0];
12689 uDstOut.au64[1] = uSrc2.au64[0];
12690 *puDst = uDstOut;
12691}
12692#endif
12693
12694
12695IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12696{
12697 RTUINT128U const uSrc2 = *puSrc2;
12698 RTUINT128U const uSrc1 = *puSrc1;
12699 ASMCompilerBarrier();
12700 RTUINT128U uDstOut;
12701 uDstOut.au64[0] = uSrc1.au64[0];
12702 uDstOut.au64[1] = uSrc2.au64[0];
12703 *puDst = uDstOut;
12704}
12705
12706
12707IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12708{
12709 RTUINT256U const uSrc2 = *puSrc2;
12710 RTUINT256U const uSrc1 = *puSrc1;
12711 ASMCompilerBarrier();
12712 RTUINT256U uDstOut;
12713 uDstOut.au64[0] = uSrc1.au64[0];
12714 uDstOut.au64[1] = uSrc2.au64[0];
12715
12716 uDstOut.au64[2] = uSrc1.au64[2];
12717 uDstOut.au64[3] = uSrc2.au64[2];
12718 *puDst = uDstOut;
12719}
12720
12721
12722/*
12723 * PACKSSWB - signed words -> signed bytes
12724 */
12725
12726#ifdef IEM_WITHOUT_ASSEMBLY
12727
12728IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12729{
12730 RTUINT64U const uSrc2 = { *puSrc };
12731 RTUINT64U const uSrc1 = { *puDst };
12732 ASMCompilerBarrier();
12733 RTUINT64U uDstOut;
12734 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12735 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12736 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12737 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12738 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12739 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12740 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12741 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12742 *puDst = uDstOut.u;
12743}
12744
12745
12746IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12747{
12748 RTUINT128U const uSrc2 = *puSrc;
12749 RTUINT128U const uSrc1 = *puDst;
12750 ASMCompilerBarrier();
12751 RTUINT128U uDstOut;
12752 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12753 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12754 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12755 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12756 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12757 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12758 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12759 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12760 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12761 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12762 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12763 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12764 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12765 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12766 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12767 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12768 *puDst = uDstOut;
12769}
12770
12771#endif
12772
12773IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12774{
12775 RTUINT128U const uSrc2 = *puSrc2;
12776 RTUINT128U const uSrc1 = *puSrc1;
12777 ASMCompilerBarrier();
12778 RTUINT128U uDstOut;
12779 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12780 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12781 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12782 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12783 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12784 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12785 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12786 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12787 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12788 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12789 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12790 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12791 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12792 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12793 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12794 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12795 *puDst = uDstOut;
12796}
12797
12798
12799IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12800{
12801 RTUINT256U const uSrc2 = *puSrc2;
12802 RTUINT256U const uSrc1 = *puSrc1;
12803 ASMCompilerBarrier();
12804 RTUINT256U uDstOut;
12805 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12806 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12807 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12808 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12809 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12810 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12811 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12812 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12813 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12814 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12815 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12816 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12817 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12818 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12819 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12820 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12821
12822 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
12823 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
12824 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
12825 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
12826 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
12827 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
12828 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
12829 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
12830 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
12831 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
12832 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
12833 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
12834 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
12835 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
12836 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
12837 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
12838 *puDst = uDstOut;
12839}
12840
12841
12842/*
12843 * PACKUSWB - signed words -> unsigned bytes
12844 */
/**
 * Converts a 16-bit value, read as a signed word, to an unsigned byte with
 * saturation: 0x0000..0x00ff pass through unchanged, anything else with bit 15
 * (the sign) set becomes 0x00, and the remaining values become 0xff.
 *
 * @note The argument is expanded more than once; pass side-effect free expressions.
 */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    ( ((uint16_t)(a_iWord) & 0xff00) == 0 \
      ? (uint8_t)(a_iWord) \
      : ( ((uint16_t)(a_iWord) & 0x8000) != 0 ? 0x00 : 0xff ) )
12849
12850#ifdef IEM_WITHOUT_ASSEMBLY
12851
12852IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12853{
12854 RTUINT64U const uSrc2 = { *puSrc };
12855 RTUINT64U const uSrc1 = { *puDst };
12856 ASMCompilerBarrier();
12857 RTUINT64U uDstOut;
12858 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12859 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12860 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12861 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12862 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12863 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12864 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12865 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12866 *puDst = uDstOut.u;
12867}
12868
12869
12870IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12871{
12872 RTUINT128U const uSrc2 = *puSrc;
12873 RTUINT128U const uSrc1 = *puDst;
12874 ASMCompilerBarrier();
12875 RTUINT128U uDstOut;
12876 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12877 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12878 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12879 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12880 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12881 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12882 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12883 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12884 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12885 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12886 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12887 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12888 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12889 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12890 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12891 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12892 *puDst = uDstOut;
12893}
12894
12895#endif
12896
12897IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12898{
12899 RTUINT128U const uSrc2 = *puSrc2;
12900 RTUINT128U const uSrc1 = *puSrc1;
12901 ASMCompilerBarrier();
12902 RTUINT128U uDstOut;
12903 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12904 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12905 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12906 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12907 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12908 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12909 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12910 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12911 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12912 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12913 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12914 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12915 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12916 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12917 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12918 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12919 *puDst = uDstOut;
12920}
12921
12922
12923IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12924{
12925 RTUINT256U const uSrc2 = *puSrc2;
12926 RTUINT256U const uSrc1 = *puSrc1;
12927 ASMCompilerBarrier();
12928 RTUINT256U uDstOut;
12929 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12930 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12931 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12932 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12933 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12934 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12935 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12936 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12937 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12938 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12939 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12940 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12941 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12942 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12943 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12944 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12945
12946 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
12947 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
12948 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
12949 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
12950 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
12951 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
12952 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
12953 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
12954 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
12955 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
12956 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
12957 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
12958 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
12959 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
12960 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
12961 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
12962 *puDst = uDstOut;
12963}
12964
12965
12966/*
12967 * PACKSSDW - signed dwords -> signed words
12968 */
12969
12970#ifdef IEM_WITHOUT_ASSEMBLY
12971
12972IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12973{
12974 RTUINT64U const uSrc2 = { *puSrc };
12975 RTUINT64U const uSrc1 = { *puDst };
12976 ASMCompilerBarrier();
12977 RTUINT64U uDstOut;
12978 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12979 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12980 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12981 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12982 *puDst = uDstOut.u;
12983}
12984
12985
12986IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12987{
12988 RTUINT128U const uSrc2 = *puSrc;
12989 RTUINT128U const uSrc1 = *puDst;
12990 ASMCompilerBarrier();
12991 RTUINT128U uDstOut;
12992 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12993 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12994 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12995 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12996 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12997 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12998 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12999 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13000 *puDst = uDstOut;
13001}
13002
13003#endif
13004
13005IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13006{
13007 RTUINT128U const uSrc2 = *puSrc2;
13008 RTUINT128U const uSrc1 = *puSrc1;
13009 ASMCompilerBarrier();
13010 RTUINT128U uDstOut;
13011 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13012 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13013 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13014 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13015 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13016 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13017 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13018 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13019 *puDst = uDstOut;
13020}
13021
13022
13023IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13024{
13025 RTUINT256U const uSrc2 = *puSrc2;
13026 RTUINT256U const uSrc1 = *puSrc1;
13027 ASMCompilerBarrier();
13028 RTUINT256U uDstOut;
13029 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13030 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13031 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13032 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13033 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13034 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13035 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13036 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13037
13038 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
13039 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
13040 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
13041 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
13042 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
13043 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
13044 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
13045 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
13046 *puDst = uDstOut;
13047}
13048
13049
/*
 * PACKUSDW - signed dwords -> unsigned words
 */
/**
 * Converts a dword, interpreted as signed, to an unsigned word with saturation.
 *
 * Values whose upper 16 bits are all clear are passed through; anything else
 * becomes 0 when bit 31 (the sign) is set and 0xffff (UINT16_MAX) otherwise.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
        ( ((uint32_t)(a_iDword) & (uint32_t)0xffff0000) == 0 \
          ? (uint16_t)(a_iDword) \
          : ((uint32_t)(a_iDword) & (uint32_t)0x80000000) != 0 \
          ? (uint16_t)0 \
          : (uint16_t)0xffff )
13057
13058#ifdef IEM_WITHOUT_ASSEMBLY
13059IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13060{
13061 RTUINT128U const uSrc2 = *puSrc;
13062 RTUINT128U const uSrc1 = *puDst;
13063 ASMCompilerBarrier();
13064 RTUINT128U uDstOut;
13065 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13066 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13067 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13068 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13069 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13070 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13071 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13072 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13073 *puDst = uDstOut;
13074}
13075#endif
13076
13077IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13078{
13079 RTUINT128U const uSrc2 = *puSrc2;
13080 RTUINT128U const uSrc1 = *puSrc1;
13081 ASMCompilerBarrier();
13082 RTUINT128U uDstOut;
13083 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13084 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13085 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13086 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13087 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13088 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13089 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13090 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13091 *puDst = uDstOut;
13092}
13093
13094
13095IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13096{
13097 RTUINT256U const uSrc2 = *puSrc2;
13098 RTUINT256U const uSrc1 = *puSrc1;
13099 ASMCompilerBarrier();
13100 RTUINT256U uDstOut;
13101 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13102 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13103 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13104 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13105 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13106 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13107 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13108 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13109
13110 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
13111 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
13112 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
13113 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
13114 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
13115 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
13116 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
13117 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
13118 *puDst = uDstOut;
13119}
13120
13121
13122/*
13123 * [V]PABSB / [V]PABSW / [V]PABSD
13124 */
13125
13126IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13127{
13128 RTUINT64U const uSrc = { *puSrc };
13129 RTUINT64U uDstOut = { 0 };
13130
13131 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
13132 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
13133 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
13134 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
13135 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
13136 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
13137 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
13138 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
13139 *puDst = uDstOut.u;
13140 RT_NOREF(pFpuState);
13141}
13142
13143
13144IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13145{
13146 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13147 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13148 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13149 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13150 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13151 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13152 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13153 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13154 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13155 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13156 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13157 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13158 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13159 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13160 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13161 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13162 RT_NOREF(pFpuState);
13163}
13164
13165
13166IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13167{
13168 RTUINT64U const uSrc = { *puSrc };
13169 RTUINT64U uDstOut = { 0 };
13170
13171 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
13172 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
13173 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
13174 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
13175 *puDst = uDstOut.u;
13176 RT_NOREF(pFpuState);
13177}
13178
13179
13180IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13181{
13182 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13183 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13184 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13185 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13186 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13187 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13188 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13189 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13190 RT_NOREF(pFpuState);
13191}
13192
13193
13194IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13195{
13196 RTUINT64U const uSrc = { *puSrc };
13197 RTUINT64U uDstOut = { 0 };
13198
13199 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
13200 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
13201 *puDst = uDstOut.u;
13202 RT_NOREF(pFpuState);
13203}
13204
13205
13206IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13207{
13208 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13209 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13210 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13211 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13212 RT_NOREF(pFpuState);
13213}
13214
13215
13216IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13217{
13218 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13219 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13220 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13221 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13222 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13223 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13224 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13225 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13226 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13227 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13228 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13229 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13230 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13231 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13232 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13233 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13234}
13235
13236
13237IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13238{
13239 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13240 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13241 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13242 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13243 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13244 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13245 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13246 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13247 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13248 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13249 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13250 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13251 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13252 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13253 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13254 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13255 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
13256 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
13257 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
13258 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
13259 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
13260 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
13261 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
13262 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
13263 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
13264 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
13265 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
13266 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
13267 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
13268 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
13269 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
13270 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
13271}
13272
13273
13274IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13275{
13276 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13277 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13278 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13279 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13280 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13281 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13282 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13283 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13284}
13285
13286
13287IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13288{
13289 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13290 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13291 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13292 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13293 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13294 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13295 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13296 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13297 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
13298 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
13299 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
13300 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
13301 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
13302 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
13303 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
13304 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
13305}
13306
13307
13308IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13309{
13310 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13311 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13312 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13313 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13314}
13315
13316
13317IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13318{
13319 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13320 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13321 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13322 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13323 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
13324 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
13325 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
13326 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
13327}
13328
13329
13330/*
13331 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
13332 */
13333IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13334{
13335 RTUINT64U uSrc1 = { *puDst };
13336 RTUINT64U uSrc2 = { *puSrc };
13337 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13338
13339 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
13340 {
13341 if (uSrc2.ai8[i] < 0)
13342 uDst.ai8[i] = -uSrc1.ai8[i];
13343 else if (uSrc2.ai8[i] == 0)
13344 uDst.ai8[i] = 0;
13345 else /* uSrc2.ai8[i] > 0 */
13346 uDst.ai8[i] = uSrc1.ai8[i];
13347 }
13348
13349 *puDst = uDst.u;
13350 RT_NOREF(pFpuState);
13351}
13352
13353
13354IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13355{
13356 RTUINT128U uSrc1 = *puDst;
13357
13358 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13359 {
13360 if (puSrc->ai8[i] < 0)
13361 puDst->ai8[i] = -uSrc1.ai8[i];
13362 else if (puSrc->ai8[i] == 0)
13363 puDst->ai8[i] = 0;
13364 else /* puSrc->ai8[i] > 0 */
13365 puDst->ai8[i] = uSrc1.ai8[i];
13366 }
13367
13368 RT_NOREF(pFpuState);
13369}
13370
13371
13372IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13373{
13374 RTUINT64U uSrc1 = { *puDst };
13375 RTUINT64U uSrc2 = { *puSrc };
13376 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13377
13378 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
13379 {
13380 if (uSrc2.ai16[i] < 0)
13381 uDst.ai16[i] = -uSrc1.ai16[i];
13382 else if (uSrc2.ai16[i] == 0)
13383 uDst.ai16[i] = 0;
13384 else /* uSrc2.ai16[i] > 0 */
13385 uDst.ai16[i] = uSrc1.ai16[i];
13386 }
13387
13388 *puDst = uDst.u;
13389 RT_NOREF(pFpuState);
13390}
13391
13392
13393IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13394{
13395 RTUINT128U uSrc1 = *puDst;
13396
13397 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13398 {
13399 if (puSrc->ai16[i] < 0)
13400 puDst->ai16[i] = -uSrc1.ai16[i];
13401 else if (puSrc->ai16[i] == 0)
13402 puDst->ai16[i] = 0;
13403 else /* puSrc->ai16[i] > 0 */
13404 puDst->ai16[i] = uSrc1.ai16[i];
13405 }
13406
13407 RT_NOREF(pFpuState);
13408}
13409
13410
13411IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13412{
13413 RTUINT64U uSrc1 = { *puDst };
13414 RTUINT64U uSrc2 = { *puSrc };
13415 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13416
13417 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
13418 {
13419 if (uSrc2.ai32[i] < 0)
13420 uDst.ai32[i] = -uSrc1.ai32[i];
13421 else if (uSrc2.ai32[i] == 0)
13422 uDst.ai32[i] = 0;
13423 else /* uSrc2.ai32[i] > 0 */
13424 uDst.ai32[i] = uSrc1.ai32[i];
13425 }
13426
13427 *puDst = uDst.u;
13428 RT_NOREF(pFpuState);
13429}
13430
13431
13432IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13433{
13434 RTUINT128U uSrc1 = *puDst;
13435
13436 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13437 {
13438 if (puSrc->ai32[i] < 0)
13439 puDst->ai32[i] = -uSrc1.ai32[i];
13440 else if (puSrc->ai32[i] == 0)
13441 puDst->ai32[i] = 0;
13442 else /* puSrc->ai32[i] > 0 */
13443 puDst->ai32[i] = uSrc1.ai32[i];
13444 }
13445
13446 RT_NOREF(pFpuState);
13447}
13448
13449
13450IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13451{
13452 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13453 {
13454 if (puSrc2->ai8[i] < 0)
13455 puDst->ai8[i] = -puSrc1->ai8[i];
13456 else if (puSrc2->ai8[i] == 0)
13457 puDst->ai8[i] = 0;
13458 else /* puSrc2->ai8[i] > 0 */
13459 puDst->ai8[i] = puSrc1->ai8[i];
13460 }
13461}
13462
13463
13464IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13465{
13466 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13467 {
13468 if (puSrc2->ai8[i] < 0)
13469 puDst->ai8[i] = -puSrc1->ai8[i];
13470 else if (puSrc2->ai8[i] == 0)
13471 puDst->ai8[i] = 0;
13472 else /* puSrc2->ai8[i] > 0 */
13473 puDst->ai8[i] = puSrc1->ai8[i];
13474 }
13475}
13476
13477
13478IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13479{
13480 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13481 {
13482 if (puSrc2->ai16[i] < 0)
13483 puDst->ai16[i] = -puSrc1->ai16[i];
13484 else if (puSrc2->ai16[i] == 0)
13485 puDst->ai16[i] = 0;
13486 else /* puSrc2->ai16[i] > 0 */
13487 puDst->ai16[i] = puSrc1->ai16[i];
13488 }
13489}
13490
13491
13492IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13493{
13494 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13495 {
13496 if (puSrc2->ai16[i] < 0)
13497 puDst->ai16[i] = -puSrc1->ai16[i];
13498 else if (puSrc2->ai16[i] == 0)
13499 puDst->ai16[i] = 0;
13500 else /* puSrc2->ai16[i] > 0 */
13501 puDst->ai16[i] = puSrc1->ai16[i];
13502 }
13503}
13504
13505
13506IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13507{
13508 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13509 {
13510 if (puSrc2->ai32[i] < 0)
13511 puDst->ai32[i] = -puSrc1->ai32[i];
13512 else if (puSrc2->ai32[i] == 0)
13513 puDst->ai32[i] = 0;
13514 else /* puSrc2->ai32[i] > 0 */
13515 puDst->ai32[i] = puSrc1->ai32[i];
13516 }
13517}
13518
13519
13520IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13521{
13522 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13523 {
13524 if (puSrc2->ai32[i] < 0)
13525 puDst->ai32[i] = -puSrc1->ai32[i];
13526 else if (puSrc2->ai32[i] == 0)
13527 puDst->ai32[i] = 0;
13528 else /* puSrc2->ai32[i] > 0 */
13529 puDst->ai32[i] = puSrc1->ai32[i];
13530 }
13531}
13532
13533
13534/*
13535 * PHADDW / VPHADDW / PHADDD / VPHADDD
13536 */
13537IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13538{
13539 RTUINT64U uSrc1 = { *puDst };
13540 RTUINT64U uSrc2 = { *puSrc };
13541 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13542
13543 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13544 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13545 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
13546 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
13547 *puDst = uDst.u;
13548 RT_NOREF(pFpuState);
13549}
13550
13551
13552IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13553{
13554 RTUINT128U uSrc1 = *puDst;
13555
13556 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13557 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13558 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
13559 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
13560
13561 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
13562 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
13563 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
13564 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
13565 RT_NOREF(pFpuState);
13566}
13567
13568
13569IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13570{
13571 RTUINT64U uSrc1 = { *puDst };
13572 RTUINT64U uSrc2 = { *puSrc };
13573 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13574
13575 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13576 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
13577 *puDst = uDst.u;
13578 RT_NOREF(pFpuState);
13579}
13580
13581
13582IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13583{
13584 RTUINT128U uSrc1 = *puDst;
13585
13586 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13587 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
13588
13589 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
13590 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
13591 RT_NOREF(pFpuState);
13592}
13593
13594
13595IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13596{
13597 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13598
13599 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
13600 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
13601 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
13602 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
13603
13604 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
13605 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
13606 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
13607 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
13608
13609 puDst->au64[0] = uDst.au64[0];
13610 puDst->au64[1] = uDst.au64[1];
13611}
13612
13613
13614IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13615{
13616 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13617
13618 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
13619 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
13620 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
13621 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
13622 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
13623 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
13624 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
13625 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
13626
13627 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
13628 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
13629 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
13630 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
13631 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
13632 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
13633 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
13634 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
13635
13636 puDst->au64[0] = uDst.au64[0];
13637 puDst->au64[1] = uDst.au64[1];
13638 puDst->au64[2] = uDst.au64[2];
13639 puDst->au64[3] = uDst.au64[3];
13640}
13641
13642
13643IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13644{
13645 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13646
13647 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
13648 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
13649
13650 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
13651 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
13652
13653 puDst->au64[0] = uDst.au64[0];
13654 puDst->au64[1] = uDst.au64[1];
13655}
13656
13657
13658IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13659{
13660 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13661
13662 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
13663 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
13664 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
13665 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
13666
13667 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
13668 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
13669 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
13670 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
13671
13672 puDst->au64[0] = uDst.au64[0];
13673 puDst->au64[1] = uDst.au64[1];
13674 puDst->au64[2] = uDst.au64[2];
13675 puDst->au64[3] = uDst.au64[3];
13676}
13677
13678
13679/*
13680 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
13681 */
13682IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13683{
13684 RTUINT64U uSrc1 = { *puDst };
13685 RTUINT64U uSrc2 = { *puSrc };
13686 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13687
13688 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13689 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13690 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
13691 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
13692 *puDst = uDst.u;
13693 RT_NOREF(pFpuState);
13694}
13695
13696
13697IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13698{
13699 RTUINT128U uSrc1 = *puDst;
13700
13701 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13702 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13703 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
13704 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
13705
13706 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
13707 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
13708 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
13709 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
13710 RT_NOREF(pFpuState);
13711}
13712
13713
13714IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13715{
13716 RTUINT64U uSrc1 = { *puDst };
13717 RTUINT64U uSrc2 = { *puSrc };
13718 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13719
13720 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13721 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
13722 *puDst = uDst.u;
13723 RT_NOREF(pFpuState);
13724}
13725
13726
13727IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13728{
13729 RTUINT128U uSrc1 = *puDst;
13730
13731 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13732 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
13733
13734 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
13735 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
13736 RT_NOREF(pFpuState);
13737}
13738
13739
13740IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13741{
13742 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13743
13744 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
13745 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
13746 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
13747 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
13748
13749 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
13750 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
13751 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
13752 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
13753
13754 puDst->au64[0] = uDst.au64[0];
13755 puDst->au64[1] = uDst.au64[1];
13756}
13757
13758
13759IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13760{
13761 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13762
13763 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
13764 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
13765 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
13766 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
13767 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
13768 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
13769 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
13770 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
13771
13772 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
13773 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
13774 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
13775 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
13776 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
13777 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
13778 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
13779 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
13780
13781 puDst->au64[0] = uDst.au64[0];
13782 puDst->au64[1] = uDst.au64[1];
13783 puDst->au64[2] = uDst.au64[2];
13784 puDst->au64[3] = uDst.au64[3];
13785}
13786
13787
13788IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13789{
13790 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13791
13792 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
13793 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
13794
13795 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
13796 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
13797
13798 puDst->au64[0] = uDst.au64[0];
13799 puDst->au64[1] = uDst.au64[1];
13800}
13801
13802
13803IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13804{
13805 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13806
13807 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
13808 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
13809 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
13810 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
13811
13812 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
13813 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
13814 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
13815 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
13816
13817 puDst->au64[0] = uDst.au64[0];
13818 puDst->au64[1] = uDst.au64[1];
13819 puDst->au64[2] = uDst.au64[2];
13820 puDst->au64[3] = uDst.au64[3];
13821}
13822
13823
13824/*
13825 * PHADDSW / VPHADDSW
13826 */
13827IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13828{
13829 RTUINT64U uSrc1 = { *puDst };
13830 RTUINT64U uSrc2 = { *puSrc };
13831 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13832
13833 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13834 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13835 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
13836 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
13837 *puDst = uDst.u;
13838 RT_NOREF(pFpuState);
13839}
13840
13841
13842IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13843{
13844 RTUINT128U uSrc1 = *puDst;
13845
13846 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13847 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13848 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
13849 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
13850
13851 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
13852 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
13853 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
13854 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
13855 RT_NOREF(pFpuState);
13856}
13857
13858
13859IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13860{
13861 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13862
13863 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
13864 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
13865 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
13866 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
13867
13868 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
13869 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
13870 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
13871 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
13872
13873 puDst->au64[0] = uDst.au64[0];
13874 puDst->au64[1] = uDst.au64[1];
13875}
13876
13877
13878IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13879{
13880 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13881
13882 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
13883 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
13884 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
13885 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
13886 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
13887 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
13888 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
13889 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
13890
13891 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
13892 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
13893 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
13894 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
13895 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
13896 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
13897 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
13898 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
13899
13900 puDst->au64[0] = uDst.au64[0];
13901 puDst->au64[1] = uDst.au64[1];
13902 puDst->au64[2] = uDst.au64[2];
13903 puDst->au64[3] = uDst.au64[3];
13904}
13905
13906
13907/*
13908 * PHSUBSW / VPHSUBSW
13909 */
13910IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13911{
13912 RTUINT64U uSrc1 = { *puDst };
13913 RTUINT64U uSrc2 = { *puSrc };
13914 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13915
13916 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13917 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13918 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
13919 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
13920 *puDst = uDst.u;
13921 RT_NOREF(pFpuState);
13922}
13923
13924
13925IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13926{
13927 RTUINT128U uSrc1 = *puDst;
13928
13929 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13930 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13931 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
13932 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
13933
13934 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
13935 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
13936 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
13937 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
13938 RT_NOREF(pFpuState);
13939}
13940
13941
13942IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13943{
13944 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13945
13946 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
13947 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
13948 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
13949 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
13950
13951 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
13952 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
13953 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
13954 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
13955
13956 puDst->au64[0] = uDst.au64[0];
13957 puDst->au64[1] = uDst.au64[1];
13958}
13959
13960
13961IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13962{
13963 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13964
13965 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
13966 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
13967 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
13968 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
13969 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
13970 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
13971 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
13972 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
13973
13974 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
13975 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
13976 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
13977 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
13978 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
13979 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
13980 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
13981 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
13982
13983 puDst->au64[0] = uDst.au64[0];
13984 puDst->au64[1] = uDst.au64[1];
13985 puDst->au64[2] = uDst.au64[2];
13986 puDst->au64[3] = uDst.au64[3];
13987}
13988
13989
13990/*
13991 * PMADDUBSW / VPMADDUBSW
13992 */
13993IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13994{
13995 RTUINT64U uSrc1 = { *puDst };
13996 RTUINT64U uSrc2 = { *puSrc };
13997 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13998
13999 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
14000 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
14001 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
14002 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
14003 *puDst = uDst.u;
14004 RT_NOREF(pFpuState);
14005}
14006
14007
14008IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14009{
14010 RTUINT128U uSrc1 = *puDst;
14011
14012 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
14013 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
14014 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
14015 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
14016 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
14017 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
14018 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
14019 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
14020 RT_NOREF(pFpuState);
14021}
14022
14023
14024IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14025{
14026 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14027
14028 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14029 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14030 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14031 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14032 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14033 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14034 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14035 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14036
14037 puDst->au64[0] = uDst.au64[0];
14038 puDst->au64[1] = uDst.au64[1];
14039}
14040
14041
14042IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14043{
14044 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14045
14046 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14047 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14048 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14049 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14050 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14051 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14052 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14053 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14054 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
14055 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
14056 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
14057 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
14058 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
14059 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
14060 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
14061 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
14062
14063 puDst->au64[0] = uDst.au64[0];
14064 puDst->au64[1] = uDst.au64[1];
14065 puDst->au64[2] = uDst.au64[2];
14066 puDst->au64[3] = uDst.au64[3];
14067}
14068
14069
14070/*
14071 * PMULHRSW / VPMULHRSW
14072 */
/**
 * Computes one PMULHRSW result word.
 *
 * The operands are multiplied as 32-bit signed integers, the product is
 * shifted right by 14, one is added for rounding, and the value is shifted
 * right once more before being truncated to 16 bits.
 *
 * The whole expansion is parenthesised so the macro behaves as a single
 * primary expression wherever it is used.
 *
 * @param   a_Src1  The first (signed word) factor.
 * @param   a_Src2  The second (signed word) factor.
 */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ((uint16_t)(((((int32_t)(a_Src1) * (a_Src2)) >> 14) + 1) >> 1))
14075
14076IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14077{
14078 RTUINT64U uSrc1 = { *puDst };
14079 RTUINT64U uSrc2 = { *puSrc };
14080 RTUINT64U uDst;
14081
14082 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
14083 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
14084 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
14085 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
14086 *puDst = uDst.u;
14087 RT_NOREF(pFpuState);
14088}
14089
14090
14091IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14092{
14093 RTUINT128U uSrc1 = *puDst;
14094
14095 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
14096 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
14097 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
14098 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
14099 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
14100 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
14101 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
14102 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
14103 RT_NOREF(pFpuState);
14104}
14105
14106
14107IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14108{
14109 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14110
14111 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
14112 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
14113 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
14114 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
14115 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
14116 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
14117 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
14118 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
14119
14120 puDst->au64[0] = uDst.au64[0];
14121 puDst->au64[1] = uDst.au64[1];
14122}
14123
14124
14125IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14126{
14127 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14128
14129 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
14130 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
14131 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
14132 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
14133 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
14134 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
14135 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
14136 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
14137 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
14138 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
14139 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
14140 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
14141 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
14142 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
14143 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
14144 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
14145
14146 puDst->au64[0] = uDst.au64[0];
14147 puDst->au64[1] = uDst.au64[1];
14148 puDst->au64[2] = uDst.au64[2];
14149 puDst->au64[3] = uDst.au64[3];
14150}
14151
14152
14153/*
14154 * PSADBW / VPSADBW
14155 */
14156#ifdef IEM_WITHOUT_ASSEMBLY
14157
14158IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
14159{
14160 RTUINT64U uSrc1 = { *puDst };
14161 RTUINT64U uSrc2 = { *puSrc };
14162 RTUINT64U uDst;
14163 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14164 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14165 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14166 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14167 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14168 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14169 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14170 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14171
14172 uDst.au64[0] = 0;
14173 uDst.au16[0] = uSum;
14174 *puDst = uDst.u;
14175}
14176
14177
14178IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14179{
14180 RTUINT128U uSrc1 = *puDst;
14181
14182 puDst->au64[0] = 0;
14183 puDst->au64[1] = 0;
14184
14185 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
14186 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
14187 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
14188 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
14189 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
14190 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
14191 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
14192 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
14193 puDst->au16[0] = uSum;
14194
14195 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
14196 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
14197 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
14198 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
14199 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
14200 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
14201 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
14202 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
14203 puDst->au16[4] = uSum;
14204}
14205
14206#endif
14207
14208IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14209{
14210 RTUINT128U uSrc1 = *puSrc1;
14211 RTUINT128U uSrc2 = *puSrc2;
14212
14213 puDst->au64[0] = 0;
14214 puDst->au64[1] = 0;
14215
14216 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
14217 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14218 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14219 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14220 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14221 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14222 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14223 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14224 puDst->au16[0] = uSum;
14225
14226 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14227 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14228 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14229 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14230 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14231 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14232 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14233 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14234 puDst->au16[4] = uSum;
14235}
14236
14237IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14238{
14239 RTUINT256U uSrc1 = *puSrc1;
14240 RTUINT256U uSrc2 = *puSrc2;
14241
14242 puDst->au64[0] = 0;
14243 puDst->au64[1] = 0;
14244 puDst->au64[2] = 0;
14245 puDst->au64[3] = 0;
14246
14247 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14248 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14249 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14250 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14251 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14252 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14253 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14254 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14255 puDst->au16[0] = uSum;
14256
14257 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14258 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14259 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14260 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14261 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14262 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14263 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14264 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14265 puDst->au16[4] = uSum;
14266
14267 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
14268 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
14269 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
14270 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
14271 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
14272 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
14273 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
14274 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
14275 puDst->au16[8] = uSum;
14276
14277 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
14278 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
14279 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
14280 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
14281 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
14282 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
14283 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
14284 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
14285 puDst->au16[12] = uSum;
14286}
14287
14288
14289/*
14290 * PMULDQ / VPMULDQ
14291 */
14292IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14293{
14294 RTUINT128U uSrc1 = *puDst;
14295
14296 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
14297 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
14298}
14299
14300IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14301{
14302 RTUINT128U uSrc1 = *puSrc1;
14303 RTUINT128U uSrc2 = *puSrc2;
14304
14305 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14306 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14307}
14308
14309IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14310{
14311 RTUINT256U uSrc1 = *puSrc1;
14312 RTUINT256U uSrc2 = *puSrc2;
14313
14314 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14315 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14316 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
14317 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
14318}
14319
14320
14321/*
14322 * PMULUDQ / VPMULUDQ
14323 */
14324#ifdef IEM_WITHOUT_ASSEMBLY
14325
14326IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14327{
14328 RTUINT64U uSrc1 = { *puDst };
14329 RTUINT64U uSrc2 = { *puSrc };
14330 ASMCompilerBarrier();
14331 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14332 RT_NOREF(pFpuState);
14333}
14334
14335
14336IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14337{
14338 RTUINT128U uSrc1 = *puDst;
14339 RTUINT128U uSrc2 = *puSrc;
14340 ASMCompilerBarrier();
14341 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14342 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14343 RT_NOREF(pFpuState);
14344}
14345
14346#endif
14347
14348IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14349{
14350 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14351 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14352 ASMCompilerBarrier();
14353 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14354 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14355}
14356
14357
14358IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14359{
14360 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14361 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14362 ASMCompilerBarrier();
14363 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14364 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14365 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
14366 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
14367}
14368
14369
14370/*
14371 * UNPCKLPS / VUNPCKLPS
14372 */
14373#ifdef IEM_WITHOUT_ASSEMBLY
14374IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14375{
14376 RTUINT128U uSrc1 = *puDst;
14377 RTUINT128U uSrc2 = *puSrc;
14378 ASMCompilerBarrier();
14379 puDst->au32[0] = uSrc1.au32[0];
14380 puDst->au32[1] = uSrc2.au32[0];
14381 puDst->au32[2] = uSrc1.au32[1];
14382 puDst->au32[3] = uSrc2.au32[1];
14383}
14384
14385#endif
14386
14387IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14388{
14389 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14390 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14391 ASMCompilerBarrier();
14392 puDst->au32[0] = uSrc1.au32[0];
14393 puDst->au32[1] = uSrc2.au32[0];
14394 puDst->au32[2] = uSrc1.au32[1];
14395 puDst->au32[3] = uSrc2.au32[1];
14396}
14397
14398
14399IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14400{
14401 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14402 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14403 ASMCompilerBarrier();
14404 puDst->au32[0] = uSrc1.au32[0];
14405 puDst->au32[1] = uSrc2.au32[0];
14406 puDst->au32[2] = uSrc1.au32[1];
14407 puDst->au32[3] = uSrc2.au32[1];
14408
14409 puDst->au32[4] = uSrc1.au32[4];
14410 puDst->au32[5] = uSrc2.au32[4];
14411 puDst->au32[6] = uSrc1.au32[5];
14412 puDst->au32[7] = uSrc2.au32[5];
14413}
14414
14415
14416/*
14417 * UNPCKLPD / VUNPCKLPD
14418 */
14419#ifdef IEM_WITHOUT_ASSEMBLY
14420IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14421{
14422 RTUINT128U uSrc1 = *puDst;
14423 RTUINT128U uSrc2 = *puSrc;
14424 ASMCompilerBarrier();
14425 puDst->au64[0] = uSrc1.au64[0];
14426 puDst->au64[1] = uSrc2.au64[0];
14427}
14428
14429#endif
14430
14431IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14432{
14433 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14434 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14435 ASMCompilerBarrier();
14436 puDst->au64[0] = uSrc1.au64[0];
14437 puDst->au64[1] = uSrc2.au64[0];
14438}
14439
14440
14441IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14442{
14443 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14444 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14445 ASMCompilerBarrier();
14446 puDst->au64[0] = uSrc1.au64[0];
14447 puDst->au64[1] = uSrc2.au64[0];
14448 puDst->au64[2] = uSrc1.au64[2];
14449 puDst->au64[3] = uSrc2.au64[2];
14450}
14451
14452
14453/*
14454 * UNPCKHPS / VUNPCKHPS
14455 */
14456#ifdef IEM_WITHOUT_ASSEMBLY
14457IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14458{
14459 RTUINT128U uSrc1 = *puDst;
14460 RTUINT128U uSrc2 = *puSrc;
14461 ASMCompilerBarrier();
14462 puDst->au32[0] = uSrc1.au32[2];
14463 puDst->au32[1] = uSrc2.au32[2];
14464 puDst->au32[2] = uSrc1.au32[3];
14465 puDst->au32[3] = uSrc2.au32[3];
14466}
14467
14468#endif
14469
14470IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14471{
14472 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14473 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14474 ASMCompilerBarrier();
14475 puDst->au32[0] = uSrc1.au32[2];
14476 puDst->au32[1] = uSrc2.au32[2];
14477 puDst->au32[2] = uSrc1.au32[3];
14478 puDst->au32[3] = uSrc2.au32[3];
14479}
14480
14481
14482IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14483{
14484 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14485 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14486 ASMCompilerBarrier();
14487 puDst->au32[0] = uSrc1.au32[2];
14488 puDst->au32[1] = uSrc2.au32[2];
14489 puDst->au32[2] = uSrc1.au32[3];
14490 puDst->au32[3] = uSrc2.au32[3];
14491
14492 puDst->au32[4] = uSrc1.au32[6];
14493 puDst->au32[5] = uSrc2.au32[6];
14494 puDst->au32[6] = uSrc1.au32[7];
14495 puDst->au32[7] = uSrc2.au32[7];
14496}
14497
14498
14499/*
14500 * UNPCKHPD / VUNPCKHPD
14501 */
14502#ifdef IEM_WITHOUT_ASSEMBLY
14503IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14504{
14505 RTUINT128U uSrc1 = *puDst;
14506 RTUINT128U uSrc2 = *puSrc;
14507 ASMCompilerBarrier();
14508 puDst->au64[0] = uSrc1.au64[1];
14509 puDst->au64[1] = uSrc2.au64[1];
14510}
14511
14512#endif
14513
14514IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14515{
14516 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14517 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14518 ASMCompilerBarrier();
14519 puDst->au64[0] = uSrc1.au64[1];
14520 puDst->au64[1] = uSrc2.au64[1];
14521}
14522
14523
14524IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14525{
14526 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14527 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14528 ASMCompilerBarrier();
14529 puDst->au64[0] = uSrc1.au64[1];
14530 puDst->au64[1] = uSrc2.au64[1];
14531 puDst->au64[2] = uSrc1.au64[3];
14532 puDst->au64[3] = uSrc2.au64[3];
14533}
14534
14535
14536/*
 * CRC32 (SSE 4.2).
14538 */
14539
14540IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
14541{
14542 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14543}
14544
14545
14546IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
14547{
14548 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14549}
14550
14551IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
14552{
14553 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14554}
14555
14556IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
14557{
14558 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14559}
14560
14561
14562/*
 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
14564 */
14565#ifdef IEM_WITHOUT_ASSEMBLY
14566IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
14567{
14568 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14569 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14570 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14571 fEfl |= X86_EFL_ZF;
14572 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14573 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14574 fEfl |= X86_EFL_CF;
14575 *pfEFlags = fEfl;
14576}
14577#endif
14578
14579IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
14580{
14581 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14582 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14583 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
14584 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
14585 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14586 fEfl |= X86_EFL_ZF;
14587 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14588 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
14589 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
14590 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14591 fEfl |= X86_EFL_CF;
14592 *pfEFlags = fEfl;
14593}
14594
14595
14596/*
14597 * PMOVSXBW / VPMOVSXBW
14598 */
14599IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14600{
14601 RTUINT64U uSrc1 = { uSrc };
14602 puDst->ai16[0] = uSrc1.ai8[0];
14603 puDst->ai16[1] = uSrc1.ai8[1];
14604 puDst->ai16[2] = uSrc1.ai8[2];
14605 puDst->ai16[3] = uSrc1.ai8[3];
14606 puDst->ai16[4] = uSrc1.ai8[4];
14607 puDst->ai16[5] = uSrc1.ai8[5];
14608 puDst->ai16[6] = uSrc1.ai8[6];
14609 puDst->ai16[7] = uSrc1.ai8[7];
14610}
14611
14612
14613IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14614{
14615 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14616 puDst->ai16[ 0] = uSrc1.ai8[ 0];
14617 puDst->ai16[ 1] = uSrc1.ai8[ 1];
14618 puDst->ai16[ 2] = uSrc1.ai8[ 2];
14619 puDst->ai16[ 3] = uSrc1.ai8[ 3];
14620 puDst->ai16[ 4] = uSrc1.ai8[ 4];
14621 puDst->ai16[ 5] = uSrc1.ai8[ 5];
14622 puDst->ai16[ 6] = uSrc1.ai8[ 6];
14623 puDst->ai16[ 7] = uSrc1.ai8[ 7];
14624 puDst->ai16[ 8] = uSrc1.ai8[ 8];
14625 puDst->ai16[ 9] = uSrc1.ai8[ 9];
14626 puDst->ai16[10] = uSrc1.ai8[10];
14627 puDst->ai16[11] = uSrc1.ai8[11];
14628 puDst->ai16[12] = uSrc1.ai8[12];
14629 puDst->ai16[13] = uSrc1.ai8[13];
14630 puDst->ai16[14] = uSrc1.ai8[14];
14631 puDst->ai16[15] = uSrc1.ai8[15];
14632}
14633
14634
14635/*
14636 * PMOVSXBD / VPMOVSXBD
14637 */
14638IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14639{
14640 RTUINT32U uSrc1 = { uSrc };
14641 puDst->ai32[0] = uSrc1.ai8[0];
14642 puDst->ai32[1] = uSrc1.ai8[1];
14643 puDst->ai32[2] = uSrc1.ai8[2];
14644 puDst->ai32[3] = uSrc1.ai8[3];
14645}
14646
14647
14648IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14649{
14650 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14651 puDst->ai32[0] = uSrc1.ai8[0];
14652 puDst->ai32[1] = uSrc1.ai8[1];
14653 puDst->ai32[2] = uSrc1.ai8[2];
14654 puDst->ai32[3] = uSrc1.ai8[3];
14655 puDst->ai32[4] = uSrc1.ai8[4];
14656 puDst->ai32[5] = uSrc1.ai8[5];
14657 puDst->ai32[6] = uSrc1.ai8[6];
14658 puDst->ai32[7] = uSrc1.ai8[7];
14659}
14660
14661
14662/*
14663 * PMOVSXBQ / VPMOVSXBQ
14664 */
14665IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14666{
14667 RTUINT16U uSrc1 = { uSrc };
14668 puDst->ai64[0] = uSrc1.ai8[0];
14669 puDst->ai64[1] = uSrc1.ai8[1];
14670}
14671
14672
14673IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14674{
14675 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14676 puDst->ai64[0] = uSrc1.ai8[0];
14677 puDst->ai64[1] = uSrc1.ai8[1];
14678 puDst->ai64[2] = uSrc1.ai8[2];
14679 puDst->ai64[3] = uSrc1.ai8[3];
14680}
14681
14682
14683/*
14684 * PMOVSXWD / VPMOVSXWD
14685 */
14686IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14687{
14688 RTUINT64U uSrc1 = { uSrc };
14689 puDst->ai32[0] = uSrc1.ai16[0];
14690 puDst->ai32[1] = uSrc1.ai16[1];
14691 puDst->ai32[2] = uSrc1.ai16[2];
14692 puDst->ai32[3] = uSrc1.ai16[3];
14693}
14694
14695
14696IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14697{
14698 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14699 puDst->ai32[0] = uSrc1.ai16[0];
14700 puDst->ai32[1] = uSrc1.ai16[1];
14701 puDst->ai32[2] = uSrc1.ai16[2];
14702 puDst->ai32[3] = uSrc1.ai16[3];
14703 puDst->ai32[4] = uSrc1.ai16[4];
14704 puDst->ai32[5] = uSrc1.ai16[5];
14705 puDst->ai32[6] = uSrc1.ai16[6];
14706 puDst->ai32[7] = uSrc1.ai16[7];
14707}
14708
14709
14710/*
14711 * PMOVSXWQ / VPMOVSXWQ
14712 */
14713IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14714{
14715 RTUINT32U uSrc1 = { uSrc };
14716 puDst->ai64[0] = uSrc1.ai16[0];
14717 puDst->ai64[1] = uSrc1.ai16[1];
14718}
14719
14720
14721IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14722{
14723 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14724 puDst->ai64[0] = uSrc1.ai16[0];
14725 puDst->ai64[1] = uSrc1.ai16[1];
14726 puDst->ai64[2] = uSrc1.ai16[2];
14727 puDst->ai64[3] = uSrc1.ai16[3];
14728}
14729
14730
14731/*
14732 * PMOVSXDQ / VPMOVSXDQ
14733 */
14734IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14735{
14736 RTUINT64U uSrc1 = { uSrc };
14737 puDst->ai64[0] = uSrc1.ai32[0];
14738 puDst->ai64[1] = uSrc1.ai32[1];
14739}
14740
14741
14742IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14743{
14744 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14745 puDst->ai64[0] = uSrc1.ai32[0];
14746 puDst->ai64[1] = uSrc1.ai32[1];
14747 puDst->ai64[2] = uSrc1.ai32[2];
14748 puDst->ai64[3] = uSrc1.ai32[3];
14749}
14750
14751
14752/*
14753 * PMOVZXBW / VPMOVZXBW
14754 */
14755IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14756{
14757 RTUINT64U uSrc1 = { uSrc };
14758 puDst->au16[0] = uSrc1.au8[0];
14759 puDst->au16[1] = uSrc1.au8[1];
14760 puDst->au16[2] = uSrc1.au8[2];
14761 puDst->au16[3] = uSrc1.au8[3];
14762 puDst->au16[4] = uSrc1.au8[4];
14763 puDst->au16[5] = uSrc1.au8[5];
14764 puDst->au16[6] = uSrc1.au8[6];
14765 puDst->au16[7] = uSrc1.au8[7];
14766}
14767
14768
14769IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14770{
14771 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14772 puDst->au16[ 0] = uSrc1.au8[ 0];
14773 puDst->au16[ 1] = uSrc1.au8[ 1];
14774 puDst->au16[ 2] = uSrc1.au8[ 2];
14775 puDst->au16[ 3] = uSrc1.au8[ 3];
14776 puDst->au16[ 4] = uSrc1.au8[ 4];
14777 puDst->au16[ 5] = uSrc1.au8[ 5];
14778 puDst->au16[ 6] = uSrc1.au8[ 6];
14779 puDst->au16[ 7] = uSrc1.au8[ 7];
14780 puDst->au16[ 8] = uSrc1.au8[ 8];
14781 puDst->au16[ 9] = uSrc1.au8[ 9];
14782 puDst->au16[10] = uSrc1.au8[10];
14783 puDst->au16[11] = uSrc1.au8[11];
14784 puDst->au16[12] = uSrc1.au8[12];
14785 puDst->au16[13] = uSrc1.au8[13];
14786 puDst->au16[14] = uSrc1.au8[14];
14787 puDst->au16[15] = uSrc1.au8[15];
14788}
14789
14790
14791/*
14792 * PMOVZXBD / VPMOVZXBD
14793 */
14794IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14795{
14796 RTUINT32U uSrc1 = { uSrc };
14797 puDst->au32[0] = uSrc1.au8[0];
14798 puDst->au32[1] = uSrc1.au8[1];
14799 puDst->au32[2] = uSrc1.au8[2];
14800 puDst->au32[3] = uSrc1.au8[3];
14801}
14802
14803
14804IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14805{
14806 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14807 puDst->au32[0] = uSrc1.au8[0];
14808 puDst->au32[1] = uSrc1.au8[1];
14809 puDst->au32[2] = uSrc1.au8[2];
14810 puDst->au32[3] = uSrc1.au8[3];
14811 puDst->au32[4] = uSrc1.au8[4];
14812 puDst->au32[5] = uSrc1.au8[5];
14813 puDst->au32[6] = uSrc1.au8[6];
14814 puDst->au32[7] = uSrc1.au8[7];
14815}
14816
14817
14818/*
14819 * PMOVZXBQ / VPMOVZXBQ
14820 */
14821IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14822{
14823 RTUINT16U uSrc1 = { uSrc };
14824 puDst->au64[0] = uSrc1.au8[0];
14825 puDst->au64[1] = uSrc1.au8[1];
14826}
14827
14828
14829IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14830{
14831 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14832 puDst->au64[0] = uSrc1.au8[0];
14833 puDst->au64[1] = uSrc1.au8[1];
14834 puDst->au64[2] = uSrc1.au8[2];
14835 puDst->au64[3] = uSrc1.au8[3];
14836}
14837
14838
14839/*
14840 * PMOVZXWD / VPMOVZXWD
14841 */
14842IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14843{
14844 RTUINT64U uSrc1 = { uSrc };
14845 puDst->au32[0] = uSrc1.au16[0];
14846 puDst->au32[1] = uSrc1.au16[1];
14847 puDst->au32[2] = uSrc1.au16[2];
14848 puDst->au32[3] = uSrc1.au16[3];
14849}
14850
14851
14852IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14853{
14854 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14855 puDst->au32[0] = uSrc1.au16[0];
14856 puDst->au32[1] = uSrc1.au16[1];
14857 puDst->au32[2] = uSrc1.au16[2];
14858 puDst->au32[3] = uSrc1.au16[3];
14859 puDst->au32[4] = uSrc1.au16[4];
14860 puDst->au32[5] = uSrc1.au16[5];
14861 puDst->au32[6] = uSrc1.au16[6];
14862 puDst->au32[7] = uSrc1.au16[7];
14863}
14864
14865
14866/*
14867 * PMOVZXWQ / VPMOVZXWQ
14868 */
14869IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14870{
14871 RTUINT32U uSrc1 = { uSrc };
14872 puDst->au64[0] = uSrc1.au16[0];
14873 puDst->au64[1] = uSrc1.au16[1];
14874}
14875
14876
14877IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14878{
14879 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14880 puDst->au64[0] = uSrc1.au16[0];
14881 puDst->au64[1] = uSrc1.au16[1];
14882 puDst->au64[2] = uSrc1.au16[2];
14883 puDst->au64[3] = uSrc1.au16[3];
14884}
14885
14886
14887/*
14888 * PMOVZXDQ / VPMOVZXDQ
14889 */
14890IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14891{
14892 RTUINT64U uSrc1 = { uSrc };
14893 puDst->au64[0] = uSrc1.au32[0];
14894 puDst->au64[1] = uSrc1.au32[1];
14895}
14896
14897
14898IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14899{
14900 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14901 puDst->au64[0] = uSrc1.au32[0];
14902 puDst->au64[1] = uSrc1.au32[1];
14903 puDst->au64[2] = uSrc1.au32[2];
14904 puDst->au64[3] = uSrc1.au32[3];
14905}
14906
14907/**
14908 * Converts from the packed IPRT 32-bit (single precision) floating point format to
14909 * the SoftFloat 32-bit floating point format (float32_t).
14910 *
14911 * This is only a structure format conversion, nothing else.
14912 */
14913DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
14914{
14915 float32_t Tmp;
14916 Tmp.v = pr32Val->u;
14917 return Tmp;
14918}
14919
14920
14921/**
14922 * Converts from SoftFloat 32-bit floating point format (float32_t)
14923 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
14924 *
14925 * This is only a structure format conversion, nothing else.
14926 */
14927DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
14928{
14929 pr32Dst->u = r32XSrc.v;
14930 return pr32Dst;
14931}
14932
14933
14934/**
14935 * Converts from the packed IPRT 64-bit (single precision) floating point format to
14936 * the SoftFloat 64-bit floating point format (float64_t).
14937 *
14938 * This is only a structure format conversion, nothing else.
14939 */
14940DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
14941{
14942 float64_t Tmp;
14943 Tmp.v = pr64Val->u;
14944 return Tmp;
14945}
14946
14947
14948/**
14949 * Converts from SoftFloat 64-bit floating point format (float64_t)
14950 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
14951 *
14952 * This is only a structure format conversion, nothing else.
14953 */
14954DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
14955{
14956 pr64Dst->u = r64XSrc.v;
14957 return pr64Dst;
14958}
14959
14960
/**
 * Initializer for the SoftFloat state structure, derived from an MXCSR value.
 *
 * MXCSR.RC is translated to the corresponding SoftFloat rounding mode, with
 * every value other than nearest, up and down ending up as
 * softfloat_round_minMag.  The MXCSR exception mask bits are shifted down so
 * that they match the X86_FSW_?E bit positions.
 *
 * @param   a_Mxcsr     The MXCSR value.  Expanded several times, so the
 *                      argument must be free of side effects.
 */
#define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        softfloat_tininess_afterRounding, \
        \
        /* Rounding mode from MXCSR.RC: */ \
          ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? (uint8_t)softfloat_round_min \
        :                                                           (uint8_t)softfloat_round_minMag, \
        \
        /* NOTE(review): presumably the initial exception flags (nothing raised yet); */ \
        /* the structure layout is not visible here, so confirm against softfloat_state_t. */ \
        0, \
        \
        /* The exception masks, matches X86_FSW_?E: */ \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), \
        \
        /* Rounding precision, not relevant for SIMD: */ \
        32 \
    }
14973
14974#ifdef IEM_WITHOUT_ASSEMBLY
14975
14976/**
14977 * Helper for transfering exception to MXCSR and setting the result value
14978 * accordingly.
14979 *
14980 * @returns Updated MXCSR.
14981 * @param pSoftState The SoftFloat state following the operation.
14982 * @param r32Result The result of the SoftFloat operation.
14983 * @param pr32Result Where to store the result for IEM.
14984 * @param fMxcsr The original MXCSR value.
14985 */
14986DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
14987 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14988{
14989 iemFpSoftF32ToIprt(pr32Result, r32Result);
14990
14991 uint8_t fXcpt = pSoftState->exceptionFlags;
14992 if ( (fMxcsr & X86_MXCSR_FZ)
14993 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
14994 {
14995 /* Underflow masked and flush to zero is set. */
14996 pr32Result->s.uFraction = 0;
14997 pr32Result->s.uExponent = 0;
14998 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14999 }
15000
15001 /* If DAZ is set \#DE is never set. */
15002 if ( fMxcsr & X86_MXCSR_DAZ
15003 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15004 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15005 fXcpt &= ~X86_MXCSR_DE;
15006
15007 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15008}
15009
15010
15011/**
15012 * Helper for transfering exception to MXCSR and setting the result value
15013 * accordingly - ignores Flush-to-Zero.
15014 *
15015 * @returns Updated MXCSR.
15016 * @param pSoftState The SoftFloat state following the operation.
15017 * @param r32Result The result of the SoftFloat operation.
15018 * @param pr32Result Where to store the result for IEM.
15019 * @param fMxcsr The original MXCSR value.
15020 */
15021DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
15022 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15023{
15024 iemFpSoftF32ToIprt(pr32Result, r32Result);
15025
15026 uint8_t fXcpt = pSoftState->exceptionFlags;
15027 /* If DAZ is set \#DE is never set. */
15028 if ( fMxcsr & X86_MXCSR_DAZ
15029 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15030 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15031 fXcpt &= ~X86_MXCSR_DE;
15032
15033 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15034}
15035
15036
15037/**
15038 * Helper for transfering exception to MXCSR and setting the result value
15039 * accordingly.
15040 *
15041 * @returns Updated MXCSR.
15042 * @param pSoftState The SoftFloat state following the operation.
15043 * @param r64Result The result of the SoftFloat operation.
15044 * @param pr64Result Where to store the result for IEM.
15045 * @param fMxcsr The original MXCSR value.
15046 */
15047DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
15048 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15049{
15050 iemFpSoftF64ToIprt(pr64Result, r64Result);
15051 uint8_t fXcpt = pSoftState->exceptionFlags;
15052 if ( (fMxcsr & X86_MXCSR_FZ)
15053 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
15054 {
15055 /* Underflow masked and flush to zero is set. */
15056 iemFpSoftF64ToIprt(pr64Result, r64Result);
15057 pr64Result->s.uFractionHigh = 0;
15058 pr64Result->s.uFractionLow = 0;
15059 pr64Result->s.uExponent = 0;
15060 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15061 }
15062
15063 /* If DAZ is set \#DE is never set. */
15064 if ( fMxcsr & X86_MXCSR_DAZ
15065 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15066 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15067 fXcpt &= ~X86_MXCSR_DE;
15068
15069 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15070}
15071
15072
15073/**
15074 * Helper for transfering exception to MXCSR and setting the result value
15075 * accordingly - ignores Flush-to-Zero.
15076 *
15077 * @returns Updated MXCSR.
15078 * @param pSoftState The SoftFloat state following the operation.
15079 * @param r64Result The result of the SoftFloat operation.
15080 * @param pr64Result Where to store the result for IEM.
15081 * @param fMxcsr The original MXCSR value.
15082 */
15083DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
15084 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15085{
15086 iemFpSoftF64ToIprt(pr64Result, r64Result);
15087
15088 uint8_t fXcpt = pSoftState->exceptionFlags;
15089 /* If DAZ is set \#DE is never set. */
15090 if ( fMxcsr & X86_MXCSR_DAZ
15091 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15092 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15093 fXcpt &= ~X86_MXCSR_DE;
15094
15095 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15096}
15097
15098#endif /* IEM_WITHOUT_ASSEMBLY */
15099
15100
15101/**
15102 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
15103 * in MXCSR into account.
15104 *
15105 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15106 * @param pr32Val Where to store the result.
15107 * @param fMxcsr The input MXCSR value.
15108 * @param pr32Src The value to use.
15109 */
15110DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15111{
15112 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
15113 {
15114 if (fMxcsr & X86_MXCSR_DAZ)
15115 {
15116 /* De-normals are changed to 0. */
15117 pr32Val->s.fSign = pr32Src->s.fSign;
15118 pr32Val->s.uFraction = 0;
15119 pr32Val->s.uExponent = 0;
15120 return 0;
15121 }
15122
15123 *pr32Val = *pr32Src;
15124 return X86_MXCSR_DE;
15125 }
15126
15127 *pr32Val = *pr32Src;
15128 return 0;
15129}
15130
15131
15132/**
15133 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
15134 * in MXCSR into account.
15135 *
15136 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15137 * @param pr64Val Where to store the result.
15138 * @param fMxcsr The input MXCSR value.
15139 * @param pr64Src The value to use.
15140 */
15141DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15142{
15143 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
15144 {
15145 if (fMxcsr & X86_MXCSR_DAZ)
15146 {
15147 /* De-normals are changed to 0. */
15148 pr64Val->s64.fSign = pr64Src->s.fSign;
15149 pr64Val->s64.uFraction = 0;
15150 pr64Val->s64.uExponent = 0;
15151 return 0;
15152 }
15153
15154 *pr64Val = *pr64Src;
15155 return X86_MXCSR_DE;
15156 }
15157
15158 *pr64Val = *pr64Src;
15159 return 0;
15160}
15161
15162#ifdef IEM_WITHOUT_ASSEMBLY
15163
15164/**
15165 * Validates the given input operands returning whether the operation can continue or whether one
15166 * of the source operands contains a NaN value, setting the output accordingly.
15167 *
15168 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15169 * @param pr32Res Where to store the result in case the operation can't continue.
15170 * @param pr32Val1 The first input operand.
15171 * @param pr32Val2 The second input operand.
15172 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15173 */
15174DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
15175{
15176 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
15177 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
15178 if (cSNan + cQNan == 2)
15179 {
15180 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15181 *pr32Res = *pr32Val1;
15182 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15183 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15184 return true;
15185 }
15186 if (cSNan)
15187 {
15188 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15189 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15190 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15191 *pfMxcsr |= X86_MXCSR_IE;
15192 return true;
15193 }
15194 if (cQNan)
15195 {
15196 /* The QNan operand is placed into the result. */
15197 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15198 return true;
15199 }
15200
15201 Assert(!cQNan && !cSNan);
15202 return false;
15203}
15204
15205
15206/**
15207 * Validates the given double precision input operands returning whether the operation can continue or whether one
15208 * of the source operands contains a NaN value, setting the output accordingly.
15209 *
15210 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15211 * @param pr64Res Where to store the result in case the operation can't continue.
15212 * @param pr64Val1 The first input operand.
15213 * @param pr64Val2 The second input operand.
15214 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15215 */
15216DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
15217{
15218 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
15219 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
15220 if (cSNan + cQNan == 2)
15221 {
15222 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15223 *pr64Res = *pr64Val1;
15224 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15225 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15226 return true;
15227 }
15228 if (cSNan)
15229 {
15230 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15231 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15232 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15233 *pfMxcsr |= X86_MXCSR_IE;
15234 return true;
15235 }
15236 if (cQNan)
15237 {
15238 /* The QNan operand is placed into the result. */
15239 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15240 return true;
15241 }
15242
15243 Assert(!cQNan && !cSNan);
15244 return false;
15245}
15246
15247
15248/**
15249 * Validates the given single input operand returning whether the operation can continue or whether
15250 * contains a NaN value, setting the output accordingly.
15251 *
15252 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15253 * @param pr32Res Where to store the result in case the operation can't continue.
15254 * @param pr32Val The input operand.
15255 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15256 */
15257DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
15258{
15259 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
15260 {
15261 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15262 *pr32Res = *pr32Val;
15263 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15264 *pfMxcsr |= X86_MXCSR_IE;
15265 return true;
15266 }
15267 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
15268 {
15269 /* The QNan operand is placed into the result. */
15270 *pr32Res = *pr32Val;
15271 return true;
15272 }
15273
15274 return false;
15275}
15276
15277
15278/**
15279 * Validates the given double input operand returning whether the operation can continue or whether
15280 * contains a NaN value, setting the output accordingly.
15281 *
15282 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15283 * @param pr64Res Where to store the result in case the operation can't continue.
15284 * @param pr64Val The input operand.
15285 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15286 */
15287DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
15288{
15289 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
15290 {
15291 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15292 *pr64Res = *pr64Val;
15293 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15294 *pfMxcsr |= X86_MXCSR_IE;
15295 return true;
15296 }
15297 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
15298 {
15299 /* The QNan operand is placed into the result. */
15300 *pr64Res = *pr64Val;
15301 return true;
15302 }
15303
15304 return false;
15305}
15306
15307#endif /* IEM_WITHOUT_ASSEMBLY */
15308
15309/**
15310 * ADDPS
15311 */
15312#ifdef IEM_WITHOUT_ASSEMBLY
15313static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15314{
15315 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15316 return fMxcsr;
15317
15318 RTFLOAT32U r32Src1, r32Src2;
15319 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15320 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15321 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15322 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15323 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15324}
15325
15326
15327IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15328{
15329 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15330 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15331 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15332 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15333}
15334#endif
15335
15336
15337/**
15338 * ADDSS
15339 */
15340#ifdef IEM_WITHOUT_ASSEMBLY
15341IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15342{
15343 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15344 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15345 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15346 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15347}
15348#endif
15349
15350
15351/**
15352 * ADDPD
15353 */
15354#ifdef IEM_WITHOUT_ASSEMBLY
15355static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15356{
15357 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15358 return fMxcsr;
15359
15360 RTFLOAT64U r64Src1, r64Src2;
15361 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15362 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15363 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15364 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15365 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15366}
15367
15368
15369IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15370{
15371 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15372 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15373}
15374#endif
15375
15376
15377/**
15378 * ADDSD
15379 */
15380#ifdef IEM_WITHOUT_ASSEMBLY
15381IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15382{
15383 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15384 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15385}
15386#endif
15387
15388
15389/**
15390 * MULPS
15391 */
15392#ifdef IEM_WITHOUT_ASSEMBLY
15393static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15394{
15395 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15396 return fMxcsr;
15397
15398 RTFLOAT32U r32Src1, r32Src2;
15399 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15400 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15401 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15402 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15403 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15404}
15405
15406
15407IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15408{
15409 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15410 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15411 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15412 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15413}
15414#endif
15415
15416
15417/**
15418 * MULSS
15419 */
15420#ifdef IEM_WITHOUT_ASSEMBLY
15421IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15422{
15423 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15424 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15425 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15426 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15427}
15428#endif
15429
15430
15431/**
15432 * MULPD
15433 */
15434#ifdef IEM_WITHOUT_ASSEMBLY
15435static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15436{
15437 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15438 return fMxcsr;
15439
15440 RTFLOAT64U r64Src1, r64Src2;
15441 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15442 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15443 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15444 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15445 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15446}
15447
15448
15449IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15450{
15451 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15452 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15453}
15454#endif
15455
15456
15457/**
15458 * MULSD
15459 */
15460#ifdef IEM_WITHOUT_ASSEMBLY
15461IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15462{
15463 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15464 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15465}
15466#endif
15467
15468
15469/**
15470 * SUBPS
15471 */
15472#ifdef IEM_WITHOUT_ASSEMBLY
15473static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15474{
15475 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15476 return fMxcsr;
15477
15478 RTFLOAT32U r32Src1, r32Src2;
15479 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15480 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15481 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15482 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15483 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15484}
15485
15486
15487IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15488{
15489 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15490 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15491 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15492 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15493}
15494#endif
15495
15496
15497/**
15498 * SUBSS
15499 */
15500#ifdef IEM_WITHOUT_ASSEMBLY
15501IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15502{
15503 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15504 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15505 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15506 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15507}
15508#endif
15509
15510
15511/**
15512 * SUBPD
15513 */
15514#ifdef IEM_WITHOUT_ASSEMBLY
15515static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15516{
15517 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15518 return fMxcsr;
15519
15520 RTFLOAT64U r64Src1, r64Src2;
15521 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15522 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15523 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15524 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15525 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15526}
15527
15528
15529IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15530{
15531 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15532 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15533}
15534#endif
15535
15536
15537/**
15538 * SUBSD
15539 */
15540#ifdef IEM_WITHOUT_ASSEMBLY
15541IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15542{
15543 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15544 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15545}
15546#endif
15547
15548
15549/**
15550 * MINPS
15551 */
15552#ifdef IEM_WITHOUT_ASSEMBLY
15553static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15554{
15555 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15556 {
15557 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15558 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15559 return fMxcsr | X86_MXCSR_IE;
15560 }
15561
15562 RTFLOAT32U r32Src1, r32Src2;
15563 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15564 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15565 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15566 {
15567 *pr32Res = r32Src2;
15568 return fMxcsr;
15569 }
15570
15571 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15572 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15573 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15574 fLe
15575 ? iemFpSoftF32FromIprt(&r32Src1)
15576 : iemFpSoftF32FromIprt(&r32Src2),
15577 pr32Res, fMxcsr);
15578}
15579
15580
15581IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15582{
15583 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15584 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15585 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15586 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15587}
15588#endif
15589
15590
15591/**
15592 * MINSS
15593 */
15594#ifdef IEM_WITHOUT_ASSEMBLY
15595IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15596{
15597 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15598 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15599 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15600 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15601}
15602#endif
15603
15604
15605/**
15606 * MINPD
15607 */
15608#ifdef IEM_WITHOUT_ASSEMBLY
15609static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15610{
15611 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15612 {
15613 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15614 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15615 return fMxcsr | X86_MXCSR_IE;
15616 }
15617
15618 RTFLOAT64U r64Src1, r64Src2;
15619 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15620 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15621 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15622 {
15623 *pr64Res = r64Src2;
15624 return fMxcsr;
15625 }
15626
15627 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15628 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15629 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15630 fLe
15631 ? iemFpSoftF64FromIprt(&r64Src1)
15632 : iemFpSoftF64FromIprt(&r64Src2),
15633 pr64Res, fMxcsr);
15634}
15635
15636
15637IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15638{
15639 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15640 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15641}
15642#endif
15643
15644
15645/**
15646 * MINSD
15647 */
15648#ifdef IEM_WITHOUT_ASSEMBLY
15649IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15650{
15651 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15652 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15653}
15654#endif
15655
15656
15657/**
15658 * DIVPS
15659 */
15660#ifdef IEM_WITHOUT_ASSEMBLY
15661static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15662{
15663 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15664 return fMxcsr;
15665
15666 RTFLOAT32U r32Src1, r32Src2;
15667 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15668 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15669 if (RTFLOAT32U_IS_ZERO(&r32Src2))
15670 {
15671 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
15672 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
15673 {
15674 *pr32Res = g_ar32QNaN[1];
15675 return fMxcsr | X86_MXCSR_IE;
15676 }
15677 else if (RTFLOAT32U_IS_INF(&r32Src1))
15678 {
15679 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15680 return fMxcsr;
15681 }
15682 else
15683 {
15684 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15685 return fMxcsr | X86_MXCSR_ZE;
15686 }
15687 }
15688
15689 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15690 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15691 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15692}
15693
15694
15695IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15696{
15697 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15698 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15699 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15700 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15701}
15702#endif
15703
15704
15705/**
15706 * DIVSS
15707 */
15708#ifdef IEM_WITHOUT_ASSEMBLY
15709IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15710{
15711 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15712 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15713 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15714 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15715}
15716#endif
15717
15718
15719/**
15720 * DIVPD
15721 */
15722#ifdef IEM_WITHOUT_ASSEMBLY
15723static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15724{
15725 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15726 return fMxcsr;
15727
15728 RTFLOAT64U r64Src1, r64Src2;
15729 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15730 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15731 if (RTFLOAT64U_IS_ZERO(&r64Src2))
15732 {
15733 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
15734 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
15735 {
15736 *pr64Res = g_ar64QNaN[1];
15737 return fMxcsr | X86_MXCSR_IE;
15738 }
15739 else if (RTFLOAT64U_IS_INF(&r64Src1))
15740 {
15741 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15742 return fMxcsr;
15743 }
15744 else
15745 {
15746 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15747 return fMxcsr | X86_MXCSR_ZE;
15748 }
15749 }
15750
15751 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15752 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15753 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15754}
15755
15756
15757IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15758{
15759 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15760 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15761}
15762#endif
15763
15764
15765/**
15766 * DIVSD
15767 */
15768#ifdef IEM_WITHOUT_ASSEMBLY
15769IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15770{
15771 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15772 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15773}
15774#endif
15775
15776
15777/**
15778 * MAXPS
15779 */
15780#ifdef IEM_WITHOUT_ASSEMBLY
15781static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15782{
15783 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15784 {
15785 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15786 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15787 return fMxcsr | X86_MXCSR_IE;
15788 }
15789
15790 RTFLOAT32U r32Src1, r32Src2;
15791 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15792 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15793 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15794 {
15795 *pr32Res = r32Src2;
15796 return fMxcsr;
15797 }
15798
15799 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15800 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15801 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15802 fLe
15803 ? iemFpSoftF32FromIprt(&r32Src2)
15804 : iemFpSoftF32FromIprt(&r32Src1),
15805 pr32Res, fMxcsr);
15806}
15807
15808
15809IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15810{
15811 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15812 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15813 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15814 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15815}
15816#endif
15817
15818
15819/**
15820 * MAXSS
15821 */
15822#ifdef IEM_WITHOUT_ASSEMBLY
15823IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15824{
15825 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15826 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15827 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15828 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15829}
15830#endif
15831
15832
15833/**
15834 * MAXPD
15835 */
15836#ifdef IEM_WITHOUT_ASSEMBLY
15837static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15838{
15839 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15840 {
15841 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15842 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15843 return fMxcsr | X86_MXCSR_IE;
15844 }
15845
15846 RTFLOAT64U r64Src1, r64Src2;
15847 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15848 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15849 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15850 {
15851 *pr64Res = r64Src2;
15852 return fMxcsr;
15853 }
15854
15855 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15856 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15857 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15858 fLe
15859 ? iemFpSoftF64FromIprt(&r64Src2)
15860 : iemFpSoftF64FromIprt(&r64Src1),
15861 pr64Res, fMxcsr);
15862}
15863
15864
15865IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15866{
15867 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15868 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15869}
15870#endif
15871
15872
15873/**
15874 * MAXSD
15875 */
15876#ifdef IEM_WITHOUT_ASSEMBLY
15877IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15878{
15879 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15880 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15881}
15882#endif
15883
15884
15885/**
15886 * CVTSS2SD
15887 */
15888#ifdef IEM_WITHOUT_ASSEMBLY
15889static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15890{
15891 RTFLOAT32U r32Src1;
15892 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15893
15894 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15895 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15896 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15897}
15898
15899
15900IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15901{
15902 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
15903 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15904}
15905#endif
15906
15907
15908/**
15909 * CVTSD2SS
15910 */
15911#ifdef IEM_WITHOUT_ASSEMBLY
15912static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15913{
15914 RTFLOAT64U r64Src1;
15915 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15916
15917 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15918 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15919 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15920}
15921
15922
15923IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15924{
15925 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
15926 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15927 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15928 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15929}
15930#endif
15931
15932
15933/**
15934 * HADDPS
15935 */
15936#ifdef IEM_WITHOUT_ASSEMBLY
15937IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15938{
15939 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15940 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15941 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15942 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15943}
15944#endif
15945
15946
15947/**
15948 * HADDPD
15949 */
15950#ifdef IEM_WITHOUT_ASSEMBLY
15951IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15952{
15953 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15954 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15955}
15956#endif
15957
15958
15959/**
15960 * HSUBPS
15961 */
15962#ifdef IEM_WITHOUT_ASSEMBLY
15963IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15964{
15965 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15966 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15967 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15968 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15969}
15970#endif
15971
15972
15973/**
15974 * HSUBPD
15975 */
15976#ifdef IEM_WITHOUT_ASSEMBLY
15977IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15978{
15979 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15980 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15981}
15982#endif
15983
15984
15985/**
15986 * SQRTPS
15987 */
15988#ifdef IEM_WITHOUT_ASSEMBLY
15989static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15990{
15991 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15992 return fMxcsr;
15993
15994 RTFLOAT32U r32Src;
15995 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
15996 if (RTFLOAT32U_IS_ZERO(&r32Src))
15997 {
15998 *pr32Res = r32Src;
15999 return fMxcsr;
16000 }
16001 else if (r32Src.s.fSign)
16002 {
16003 *pr32Res = g_ar32QNaN[1];
16004 return fMxcsr | X86_MXCSR_IE;
16005 }
16006
16007 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16008 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16009 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16010}
16011
16012
16013IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16014{
16015 RT_NOREF(puSrc1);
16016
16017 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16018 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16019 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16020 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16021}
16022#endif
16023
16024
16025/**
16026 * SQRTSS
16027 */
16028#ifdef IEM_WITHOUT_ASSEMBLY
16029IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16030{
16031 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16032 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16033 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16034 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16035}
16036#endif
16037
16038
16039/**
16040 * SQRTPD
16041 */
16042#ifdef IEM_WITHOUT_ASSEMBLY
16043static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
16044{
16045 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
16046 return fMxcsr;
16047
16048 RTFLOAT64U r64Src;
16049 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
16050 if (RTFLOAT64U_IS_ZERO(&r64Src))
16051 {
16052 *pr64Res = r64Src;
16053 return fMxcsr;
16054 }
16055 else if (r64Src.s.fSign)
16056 {
16057 *pr64Res = g_ar64QNaN[1];
16058 return fMxcsr | X86_MXCSR_IE;
16059 }
16060
16061 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16062 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
16063 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16064}
16065
16066
16067IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16068{
16069 RT_NOREF(puSrc1);
16070
16071 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16072 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16073}
16074#endif
16075
16076
16077/**
16078 * SQRTSD
16079 */
16080#ifdef IEM_WITHOUT_ASSEMBLY
16081IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16082{
16083 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
16084 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16085}
16086#endif
16087
16088
16089#ifdef IEM_WITHOUT_ASSEMBLY
16090/**
16091 * RSQRTPS
16092 */
16093static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16094{
16095 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16096 return fMxcsr;
16097
16098 RTFLOAT32U r32Src;
16099 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16100 if (RTFLOAT32U_IS_ZERO(&r32Src))
16101 {
16102 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16103 return fMxcsr;
16104 }
16105 else if (r32Src.s.fSign)
16106 {
16107 *pr32Res = g_ar32QNaN[1];
16108 return fMxcsr | X86_MXCSR_IE;
16109 }
16110
16111 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16112 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16113 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16114}
16115
16116
16117IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16118{
16119 RT_NOREF(puSrc1);
16120
16121 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16122 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16123 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16124 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16125}
16126
16127
16128/**
16129 * RSQRTSS
16130 */
16131IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16132{
16133 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16134 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16135 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16136 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16137}
16138#endif
16139
16140
16141/**
16142 * RCPPS
16143 */
16144#ifdef IEM_WITHOUT_ASSEMBLY
16145static uint32_t iemAImpl_rcp_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16146{
16147 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16148 return fMxcsr;
16149
16150 RTFLOAT32U r32Src;
16151 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16152 if (RTFLOAT32U_IS_ZERO(&r32Src))
16153 {
16154 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16155 return fMxcsr;
16156 }
16157
16158 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16159 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&g_ar32One[0]), iemFpSoftF32FromIprt(&r32Src), &SoftState);
16160 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16161}
16162
16163
16164IEM_DECL_IMPL_DEF(void, iemAImpl_rcpps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16165{
16166 RT_NOREF(puSrc1);
16167
16168 pResult->MXCSR = iemAImpl_rcp_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16169 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16170 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16171 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16172}
16173
16174
16175/**
16176 * RCPSS
16177 */
16178IEM_DECL_IMPL_DEF(void, iemAImpl_rcpss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16179{
16180 pResult->MXCSR = iemAImpl_rcp_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16181 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16182 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16183 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16184}
16185#endif
16186
16187
16188/**
16189 * ADDSUBPS
16190 */
16191#ifdef IEM_WITHOUT_ASSEMBLY
16192IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16193{
16194 RT_NOREF(puSrc1);
16195
16196 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16197 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16198 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16199 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16200}
16201#endif
16202
16203
16204/**
16205 * ADDSUBPD
16206 */
16207#ifdef IEM_WITHOUT_ASSEMBLY
16208IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16209{
16210 RT_NOREF(puSrc1);
16211
16212 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16213 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16214}
16215#endif
16216
16217
16218/**
16219 * CVTPD2PS
16220 */
16221#ifdef IEM_WITHOUT_ASSEMBLY
16222static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16223{
16224 RTFLOAT64U r64Src1;
16225 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16226
16227 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16228 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16229 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16230}
16231
16232
16233IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16234{
16235 RT_NOREF(puSrc1);
16236
16237 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16238 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16239 pResult->uResult.au32[2] = 0;
16240 pResult->uResult.au32[3] = 0;
16241}
16242#endif
16243
16244
16245/**
16246 * CVTPS2PD
16247 */
16248#ifdef IEM_WITHOUT_ASSEMBLY
16249static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16250{
16251 RTFLOAT32U r32Src1;
16252 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16253
16254 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16255 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16256 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16257}
16258
16259
16260IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16261{
16262 RT_NOREF(puSrc1);
16263
16264 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16265 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16266}
16267#endif
16268
16269
16270/**
16271 * CVTDQ2PS
16272 */
16273#ifdef IEM_WITHOUT_ASSEMBLY
16274static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
16275{
16276 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16277 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
16278 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16279}
16280
16281
16282IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16283{
16284 RT_NOREF(puSrc1);
16285
16286 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16287 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16288 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
16289 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
16290}
16291#endif
16292
16293
16294/**
16295 * CVTPS2DQ
16296 */
16297#ifdef IEM_WITHOUT_ASSEMBLY
16298static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16299{
16300 RTFLOAT32U r32Src;
16301 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16302
16303 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16304 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16305 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16306}
16307
16308
16309IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16310{
16311 RT_NOREF(puSrc1);
16312
16313 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16314 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16315 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16316 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16317}
16318#endif
16319
16320
16321/**
16322 * CVTTPS2DQ
16323 */
16324#ifdef IEM_WITHOUT_ASSEMBLY
16325static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16326{
16327 RTFLOAT32U r32Src;
16328 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16329
16330 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16331 SoftState.roundingMode = softfloat_round_minMag;
16332 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16333 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16334}
16335
16336
16337IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16338{
16339 RT_NOREF(puSrc1);
16340
16341 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16342 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16343 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16344 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16345}
16346#endif
16347
16348
16349/**
16350 * CVTTPD2DQ
16351 */
16352#ifdef IEM_WITHOUT_ASSEMBLY
16353static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16354{
16355 RTFLOAT64U r64Src;
16356 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16357
16358 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16359 SoftState.roundingMode = softfloat_round_minMag;
16360 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16361 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16362}
16363
16364
16365IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16366{
16367 RT_NOREF(puSrc1);
16368
16369 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16370 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16371 pResult->uResult.au64[1] = 0;
16372}
16373#endif
16374
16375
16376/**
16377 * CVTDQ2PD
16378 */
16379#ifdef IEM_WITHOUT_ASSEMBLY
16380static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
16381{
16382 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16383 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
16384 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16385}
16386
16387
16388IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16389{
16390 RT_NOREF(puSrc1);
16391
16392 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16393 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16394}
16395#endif
16396
16397
16398/**
16399 * CVTPD2DQ
16400 */
16401#ifdef IEM_WITHOUT_ASSEMBLY
16402static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16403{
16404 RTFLOAT64U r64Src;
16405 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16406
16407 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16408 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16409 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16410}
16411
16412
16413IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16414{
16415 RT_NOREF(puSrc1);
16416
16417 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16418 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16419 pResult->uResult.au64[1] = 0;
16420}
16421#endif
16422
16423
16424/**
16425 * [V]SHUFPS
16426 */
16427#ifdef IEM_WITHOUT_ASSEMBLY
16428IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16429{
16430 RTUINT128U const uSrc1 = *puDst;
16431 RTUINT128U const uSrc2 = *puSrc;
16432 ASMCompilerBarrier();
16433 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16434 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16435 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16436 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16437}
16438#endif
16439
16440
16441IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16442{
16443 RTUINT128U const uSrc1 = *puSrc1;
16444 RTUINT128U const uSrc2 = *puSrc2;
16445 ASMCompilerBarrier();
16446 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16447 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16448 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16449 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16450}
16451
16452
16453IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16454{
16455 RTUINT256U const uSrc1 = *puSrc1;
16456 RTUINT256U const uSrc2 = *puSrc2;
16457 ASMCompilerBarrier();
16458 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16459 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16460 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16461 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16462
16463 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
16464 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
16465 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
16466 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
16467}
16468
16469
16470/**
16471 * [V]SHUFPD
16472 */
16473#ifdef IEM_WITHOUT_ASSEMBLY
16474IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16475{
16476 RTUINT128U const uSrc1 = *puDst;
16477 RTUINT128U const uSrc2 = *puSrc;
16478 ASMCompilerBarrier();
16479 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16480 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16481}
16482#endif
16483
16484
16485IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16486{
16487 RTUINT128U const uSrc1 = *puSrc1;
16488 RTUINT128U const uSrc2 = *puSrc2;
16489 ASMCompilerBarrier();
16490 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16491 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16492}
16493
16494
16495IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16496{
16497 RTUINT256U const uSrc1 = *puSrc1;
16498 RTUINT256U const uSrc2 = *puSrc2;
16499 ASMCompilerBarrier();
16500 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16501 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16502 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
16503 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
16504}
16505
16506
16507/*
16508 * PHMINPOSUW / VPHMINPOSUW
16509 */
16510IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16511{
16512 uint16_t u16Min = puSrc->au16[0];
16513 uint8_t idxMin = 0;
16514
16515 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
16516 if (puSrc->au16[i] < u16Min)
16517 {
16518 u16Min = puSrc->au16[i];
16519 idxMin = i;
16520 }
16521
16522 puDst->au64[0] = 0;
16523 puDst->au64[1] = 0;
16524 puDst->au16[0] = u16Min;
16525 puDst->au16[1] = idxMin;
16526}
16527
16528
16529IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16530{
16531 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
16532}
16533
16534
16535/*
16536 * [V]PBLENDVB
16537 */
16538IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16539{
16540 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16541 if (puMask->au8[i] & RT_BIT(7))
16542 puDst->au8[i] = puSrc->au8[i];
16543}
16544
16545
16546IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16547{
16548 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16549 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
16550}
16551
16552
16553IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16554{
16555 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16556 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
16557}
16558
16559
16560/*
16561 * [V]BLENDVPS
16562 */
16563IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16564{
16565 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16566 if (puMask->au32[i] & RT_BIT_32(31))
16567 puDst->au32[i] = puSrc->au32[i];
16568}
16569
16570
16571IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16572{
16573 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16574 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
16575}
16576
16577
16578IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16579{
16580 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16581 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
16582}
16583
16584
16585/*
16586 * [V]BLENDVPD
16587 */
16588IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16589{
16590 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
16591 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
16592}
16593
16594
16595IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16596{
16597 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16598 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
16599}
16600
16601
16602IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16603{
16604 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16605 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
16606}
16607
16608
16609/**
16610 * [V]PALIGNR
16611 */
16612IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
16613{
16614 uint64_t const u64Src1 = *pu64Dst;
16615 ASMCompilerBarrier();
16616
16617 if (bEvil >= 16)
16618 *pu64Dst = 0;
16619 else if (bEvil >= 8)
16620 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
16621 else
16622 {
16623 uint8_t cShift = bEvil * 8;
16624 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
16625 | (u64Src2 >> cShift);
16626 }
16627}
16628
16629
16630IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16631{
16632 RTUINT128U const uSrc1 = *puDst;
16633 RTUINT128U const uSrc2 = *puSrc;
16634 ASMCompilerBarrier();
16635
16636 puDst->au64[0] = 0;
16637 puDst->au64[1] = 0;
16638 if (bEvil >= 32)
16639 { /* Everything stays 0. */ }
16640 else if (bEvil >= 16)
16641 {
16642 bEvil -= 16;
16643 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16644 puDst->au8[i - bEvil] = uSrc1.au8[i];
16645 }
16646 else
16647 {
16648 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16649 puDst->au8[i] = uSrc2.au8[i + bEvil];
16650 for (uint8_t i = 0; i < bEvil; i++)
16651 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16652 }
16653}
16654
16655
16656IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16657{
16658 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16659 RTUINT128U const uSrc2 = *puSrc2;
16660 ASMCompilerBarrier();
16661
16662 puDst->au64[0] = 0;
16663 puDst->au64[1] = 0;
16664 if (bEvil >= 32)
16665 { /* Everything stays 0. */ }
16666 else if (bEvil >= 16)
16667 {
16668 bEvil -= 16;
16669 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16670 puDst->au8[i - bEvil] = uSrc1.au8[i];
16671 }
16672 else
16673 {
16674 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16675 puDst->au8[i] = uSrc2.au8[i + bEvil];
16676 for (uint8_t i = 0; i < bEvil; i++)
16677 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16678 }
16679}
16680
16681
16682IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16683{
16684 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16685 RTUINT256U const uSrc2 = *puSrc2;
16686 ASMCompilerBarrier();
16687
16688 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
16689 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
16690}
16691
16692
16693/**
16694 * [V]PBLENDW
16695 */
16696IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16697{
16698 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16699 if (bEvil & RT_BIT(i))
16700 puDst->au16[i] = puSrc->au16[i];
16701}
16702
16703
16704IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16705{
16706 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16707 if (bEvil & RT_BIT(i))
16708 puDst->au16[i] = puSrc2->au16[i];
16709 else
16710 puDst->au16[i] = puSrc1->au16[i];
16711}
16712
16713
16714IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16715{
16716 for (uint8_t i = 0; i < 8; i++)
16717 if (bEvil & RT_BIT(i))
16718 {
16719 puDst->au16[ i] = puSrc2->au16[ i];
16720 puDst->au16[8 + i] = puSrc2->au16[8 + i];
16721 }
16722 else
16723 {
16724 puDst->au16[ i] = puSrc1->au16[ i];
16725 puDst->au16[8 + i] = puSrc1->au16[8 + i];
16726 }
16727}
16728
16729
16730/**
16731 * [V]BLENDPS
16732 */
16733IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16734{
16735 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16736 if (bEvil & RT_BIT(i))
16737 puDst->au32[i] = puSrc->au32[i];
16738}
16739
16740
16741IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16742{
16743 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16744 if (bEvil & RT_BIT(i))
16745 puDst->au32[i] = puSrc2->au32[i];
16746 else
16747 puDst->au32[i] = puSrc1->au32[i];
16748}
16749
16750
16751IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16752{
16753 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16754 if (bEvil & RT_BIT(i))
16755 puDst->au32[i] = puSrc2->au32[i];
16756 else
16757 puDst->au32[i] = puSrc1->au32[i];
16758}
16759
16760
16761/**
16762 * [V]BLENDPD
16763 */
16764IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16765{
16766 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16767 if (bEvil & RT_BIT(i))
16768 puDst->au64[i] = puSrc->au64[i];
16769}
16770
16771
16772IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16773{
16774 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16775 if (bEvil & RT_BIT(i))
16776 puDst->au64[i] = puSrc2->au64[i];
16777 else
16778 puDst->au64[i] = puSrc1->au64[i];
16779}
16780
16781
16782IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16783{
16784 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16785 if (bEvil & RT_BIT(i))
16786 puDst->au64[i] = puSrc2->au64[i];
16787 else
16788 puDst->au64[i] = puSrc1->au64[i];
16789}
16790
16791
16792/**
16793 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
16794 */
16795
/* The S-Box lookup table (256 entries, indexed by the byte to substitute). */
static uint8_t iemAImpl_aes_sbox[256] = {
    /* 0x00 */ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    /* 0x10 */ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    /* 0x20 */ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    /* 0x30 */ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    /* 0x40 */ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    /* 0x50 */ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    /* 0x60 */ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    /* 0x70 */ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    /* 0x80 */ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    /* 0x90 */ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    /* 0xa0 */ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    /* 0xb0 */ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    /* 0xc0 */ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    /* 0xd0 */ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    /* 0xe0 */ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    /* 0xf0 */ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
16814
/* The InvS-Box lookup table (256 entries, indexed by the byte to substitute). */
static uint8_t iemAImpl_aes_inv_sbox[256] = {
    /* 0x00 */ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    /* 0x10 */ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    /* 0x20 */ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    /* 0x30 */ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    /* 0x40 */ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    /* 0x50 */ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    /* 0x60 */ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    /* 0x70 */ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    /* 0x80 */ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    /* 0x90 */ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    /* 0xa0 */ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    /* 0xb0 */ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    /* 0xc0 */ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    /* 0xd0 */ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    /* 0xe0 */ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    /* 0xf0 */ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
16834
/* The ShiftRows lookup table: entry i is the source byte index for result byte i. */
static uint8_t iemAImpl_aes_shift_rows_tbl[16] = {
     0,  5, 10, 15,
     4,  9, 14,  3,
     8, 13,  2,  7,
    12,  1,  6, 11
};
16839
/* The InvShiftRows lookup table: entry i is the source byte index for result byte i. */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[16] = {
     0, 13, 10,  7,
     4,  1, 14, 11,
     8,  5,  2, 15,
    12,  9,  6,  3
};
16844
16845static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
16846{
16847 RTUINT128U uVal;
16848 int i;
16849
16850 for (i = 0; i < 16; ++i)
16851 uVal.au8[i] = abSubst[puSrc->au8[i]];
16852
16853 return uVal;
16854}
16855
/*
 * Shifts u left by one bit (dropping bit 7) and XORs in 27 (0x1b) when bit 7
 * of the input was set.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    uint8_t bDoubled = (uint8_t)(u << 1);
    if (u & 0x80)
        bDoubled ^= 0x1b;
    return bDoubled;
}
16860
16861static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
16862{
16863 RTUINT128U uVal;
16864 int i;
16865 uint8_t tmp;
16866
16867 for (i = 0; i < 16; i += 4) {
16868 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
16869 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
16870 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
16871 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
16872 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
16873 }
16874
16875 return uVal;
16876}
16877
16878static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
16879{
16880 RTUINT128U uVal;
16881 int i;
16882
16883 for (i = 0; i < 16; ++i)
16884 uVal.au8[i] = puSrc->au8[abShift[i]];
16885
16886 return uVal;
16887}
16888
/*
 * Multiplies a by b using repeated iemAImpl_aes_xtime doubling.
 *
 * Only bits 0 thru 4 of b take part: for each set bit k the value of a
 * doubled k times via xtime is XORed into the result.
 */
static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
{
    uint8_t bProduct = 0;
    uint8_t bPower   = a;   /* a doubled iBit times. */

    for (unsigned iBit = 0; iBit < 5; iBit++)
    {
        if ((b >> iBit) & 1)
            bProduct ^= bPower;
        bPower = iemAImpl_aes_xtime(bPower);
    }

    return bProduct;
}
16901
16902static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
16903{
16904 RTUINT128U uVal;
16905 int i;
16906
16907 for (i = 0; i < 16; i += 4) {
16908 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
16909 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
16910 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
16911 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
16912 }
16913
16914 return uVal;
16915}
16916
16917static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
16918{
16919 RTUINT32U uTmp;
16920
16921 uTmp.au32[0] = w;
16922 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
16923 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
16924 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
16925 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
16926
16927 return uTmp.au32[0];
16928}
16929
/*
 * Rotates the 32-bit value w right by 8 bits (the low byte becomes the high
 * byte).
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uUpperThree = w >> 8;
    uint32_t const uLowByteTop = w << 24;
    return uUpperThree | uLowByteTop;
}
16934
16935/**
16936 * [V]AESKEYGENASSIST
16937 */
16938IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
16939{
16940 RTUINT128U uTmp;
16941 uint32_t uRCon = bImm; /* Round constant. */
16942
16943 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
16944 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
16945 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
16946 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
16947
16948 *puDst = uTmp;
16949}
16950
16951
16952/**
16953 * [V]AESIMC
16954 */
16955IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16956{
16957 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
16958}
16959
16960
16961/**
16962 * [V]AESENC
16963 */
16964IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16965{
16966 RTUINT128U uTmp;
16967
16968 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16969 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16970 uTmp = iemAImpl_aes_mix_col(&uTmp);
16971 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16972 uTmp.au64[1] ^= puSrc->au64[1];
16973
16974 *puDst = uTmp;
16975}
16976
16977
16978/**
16979 * [V]AESENCLAST
16980 */
16981IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16982{
16983 RTUINT128U uTmp;
16984
16985 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16986 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16987 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16988 uTmp.au64[1] ^= puSrc->au64[1];
16989
16990 *puDst = uTmp;
16991}
16992
16993
16994/**
16995 * [V]AESDEC
16996 */
16997IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16998{
16999 RTUINT128U uTmp;
17000
17001 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17002 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17003 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
17004 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17005 uTmp.au64[1] ^= puSrc->au64[1];
17006
17007 *puDst = uTmp;
17008}
17009
17010
17011/**
17012 * [V]AESDECLAST
17013 */
17014IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17015{
17016 RTUINT128U uTmp;
17017
17018 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17019 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17020 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17021 uTmp.au64[1] ^= puSrc->au64[1];
17022
17023 *puDst = uTmp;
17024}
17025
17026
17027/**
17028 * [V]PCMPISTRI
17029 */
17030
17031/**
17032 * Does the comparisons based on the mode and source input format.
17033 */
17034static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
17035{
17036#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
17037 do \
17038 { \
17039 for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
17040 for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
17041 { \
17042 switch (a_bAggOp) \
17043 { \
17044 case 0: \
17045 case 2: \
17046 case 3: \
17047 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
17048 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
17049 break; \
17050 case 1: \
17051 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
17052 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
17053 break; \
17054 default: \
17055 AssertReleaseFailed(); \
17056 } \
17057 } \
17058 } while(0)
17059
17060 uint8_t bAggOp = (bImm >> 2) & 0x3;
17061 switch (bImm & 0x3)
17062 {
17063 case 0:
17064 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
17065 break;
17066 case 1:
17067 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
17068 break;
17069 case 2:
17070 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
17071 break;
17072 case 3:
17073 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
17074 break;
17075 default:
17076 AssertReleaseFailed();
17077 }
17078#undef PCMPXSTRX_CMP_CASE
17079}
17080
17081static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
17082{
17083 if (bImm & 0x1)
17084 {
17085 /* Words -> 8 elements. */
17086 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
17087 if (puSrc->au16[i] == 0)
17088 return i;
17089
17090 return 8;
17091 }
17092 else
17093 {
17094 /* Bytes -> 16 elements. */
17095 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
17096 if (puSrc->au8[i] == 0)
17097 return i;
17098
17099 return 16;
17100 }
17101}
17102
/**
 * Converts an explicitly given (signed) string length into an element count.
 *
 * @returns The absolute value of @a i64Len, saturated to 8 (words) or
 *          16 (bytes).
 * @param   i64Len  The length as given by the guest register.
 * @param   bImm    The instruction immediate; bit 0 set selects 16-bit
 *                  elements, clear selects 8-bit elements.
 */
static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
{
    int64_t const cMaxElems = (bImm & 0x1) ? 8 : 16;

    /* Saturate first; this also keeps the negation below free of overflow. */
    if (i64Len <= -cMaxElems || i64Len >= cMaxElems)
        return (uint8_t)cMaxElems;

    return (uint8_t)(i64Len >= 0 ? i64Len : -i64Len);
}
17120
/**
 * Valid/Invalid override of comparisons (Table 4-7 from 4.1.6 of SDM).
 *
 * The row is selected by Imm8[3:2] (the aggregation operation).  The column is
 * selected by the validity of the two elements being compared:
 *      - 0: both the xmm1 and the xmm2/m128 element are invalid,
 *      - 1: the xmm1 element is invalid, the xmm2/m128 element is valid,
 *      - 2: the xmm1 element is valid, the xmm2/m128 element is invalid.
 *
 * There is no column for the both-valid case since the comparison result is
 * used unmodified then.
 */
static const bool g_afCmpOverride[4][3] =
{
    /* both invalid   only xmm2/m128 valid   only xmm1 valid */
    { false, false, false }, /* Imm8[3:2] = 00b (equal any) */
    { false, false, false }, /* Imm8[3:2] = 01b (ranges) */
    { true, false, false }, /* Imm8[3:2] = 10b (equal each) */
    { true, true, false }, /* Imm8[3:2] = 11b (equal ordered) */
};
17132
17133DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
17134{
17135 if (fSrc1Valid && fSrc2Valid)
17136 return fCmpRes;
17137
17138 uint8_t bSrc1Valid = fSrc1Valid ? 2 : 0;
17139 uint8_t bSrc2Valid = fSrc2Valid ? 1 : 0;
17140 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
17141}
17142
17143static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
17144{
17145 uint8_t bAggOp = (bImm >> 2) & 0x3;
17146 uint16_t u16Result = 0;
17147
17148 switch (bAggOp)
17149 {
17150 case 0: /* Equal any */
17151 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
17152 {
17153 uint16_t u16Res = 0;
17154 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1++)
17155 {
17156 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
17157 idxSrc1 < idxLen1,
17158 idxSrc2 < idxLen2,
17159 bAggOp))
17160 {
17161 u16Res = RT_BIT(idxSrc2);
17162 break;
17163 }
17164 }
17165
17166 u16Result |= u16Res;
17167 }
17168 break;
17169
17170 case 1: /* Ranges */
17171 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
17172 {
17173 uint16_t u16Res = 0;
17174 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1 += 2)
17175 {
17176 if ( iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
17177 idxSrc1 < idxLen1,
17178 idxSrc2 < idxLen2,
17179 bAggOp)
17180 && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1 + 1],
17181 (idxSrc1 + 1) < idxLen1,
17182 idxSrc2 < idxLen2,
17183 bAggOp))
17184 {
17185 u16Res = RT_BIT(idxSrc2);
17186 break;
17187 }
17188 }
17189
17190 u16Result |= u16Res;
17191 }
17192 break;
17193
17194 case 2: /* Equal each */
17195 for (uint8_t i = 0; i < cElems; i++)
17196 {
17197 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[i][i],
17198 i < idxLen1,
17199 i < idxLen2,
17200 bAggOp))
17201 u16Result |= RT_BIT(i);
17202 }
17203 break;
17204
17205 case 3: /* Equal ordered */
17206 u16Result = 0;
17207 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
17208 {
17209 uint16_t u16Res = RT_BIT(idxSrc2);
17210 for (uint8_t idxSrc1 = 0, k = idxSrc2; (idxSrc1 < (cElems - idxSrc2)) && (k < cElems); idxSrc1++, k++)
17211 {
17212 if (!iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[k][idxSrc1],
17213 idxSrc1 < idxLen1,
17214 k < idxLen2,
17215 bAggOp))
17216 {
17217 u16Res = 0;
17218 break;
17219 }
17220 }
17221
17222 u16Result |= u16Res;
17223 }
17224 break;
17225 }
17226
17227 /* Polarity selection. */
17228 switch ((bImm >> 4) & 0x3)
17229 {
17230 case 0:
17231 case 2:
17232 /* Nothing to do. */
17233 break;
17234 case 1:
17235 u16Result = (cElems == 8 ? 0xff : 0xffff) ^ u16Result;
17236 break;
17237 case 3:
17238 u16Result ^= RT_BIT(idxLen2) - 1;
17239 break;
17240 default:
17241 AssertReleaseFailed();
17242 }
17243
17244 return u16Result;
17245}
17246
17247DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
17248{
17249 uint32_t fEFlags = 0;
17250
17251 if (u16Result)
17252 fEFlags |= X86_EFL_CF;
17253 if (cLen2 < cElems)
17254 fEFlags |= X86_EFL_ZF;
17255 if (cLen1 < cElems)
17256 fEFlags |= X86_EFL_SF;
17257 if (u16Result & 0x1)
17258 fEFlags |= X86_EFL_OF;
17259 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
17260}
17261
17262DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
17263 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
17264{
17265 bool afCmpRes[16][16];
17266 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17267
17268 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
17269 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
17270 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
17271
17272 return u16Result;
17273}
17274
17275DECL_FORCE_INLINE(void) iemAImpl_pcmpxstri_set_result_index(uint32_t *pu32Ecx, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
17276{
17277 if (bImm & RT_BIT(6))
17278 {
17279 /* Index for MSB set. */
17280 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
17281 if (idxMsb)
17282 *pu32Ecx = idxMsb - 1;
17283 else
17284 *pu32Ecx = cElems;
17285 }
17286 else
17287 {
17288 /* Index for LSB set. */
17289 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
17290 if (idxLsb)
17291 *pu32Ecx = idxLsb - 1;
17292 else
17293 *pu32Ecx = cElems;
17294 }
17295}
17296
17297IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
17298{
17299 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17300 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
17301 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
17302
17303 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17304 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
17305}
17306
17307
17308/**
17309 * [V]PCMPESTRI
17310 */
17311IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
17312{
17313 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17314 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
17315 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
17316
17317 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17318 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
17319}
17320
17321
17322/**
17323 * [V]PCMPISTRM
17324 */
17325DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
17326{
17327 if (bImm & RT_BIT(6))
17328 {
17329 /* Generate a mask. */
17330 if (cElems == 8)
17331 {
17332 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17333 if (u16Result & RT_BIT(i))
17334 puDst->au16[i] = 0xffff;
17335 else
17336 puDst->au16[i] = 0;
17337 }
17338 else
17339 {
17340 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17341 if (u16Result & RT_BIT(i))
17342 puDst->au8[i] = 0xff;
17343 else
17344 puDst->au8[i] = 0;
17345 }
17346 }
17347 else
17348 {
17349 /* Store the result. */
17350 puDst->au64[0] = u16Result;
17351 puDst->au64[1] = 0;
17352 }
17353}
17354
17355IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
17356{
17357 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17358 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
17359 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
17360
17361 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17362 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
17363}
17364
17365
17366/**
17367 * [V]PCMPESTRM
17368 */
17369IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
17370{
17371 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17372 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
17373 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
17374
17375 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17376 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
17377}
17378
17379
/*
 * [V]PCLMULQDQ
 */

/**
 * Two operand form: forwards to the three operand fallback, using the
 * destination as the first source.
 *
 * @param   puDst   The first source on input, the result on output.
 * @param   puSrc   The second source.
 * @param   bEvil   The instruction immediate (passed on unmodified).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
{
    iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
}
17387
17388
17389IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17390{
17391 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
17392 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
17393
17394 puDst->au64[0] = 0;
17395 puDst->au64[1] = 0;
17396
17397 /*
17398 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
17399 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
17400 * and squeeze out some optimizations.
17401 */
17402 if (uSrc1 & 0x1)
17403 puDst->au64[0] = uSrc2;
17404
17405 uSrc1 >>= 1;
17406
17407 uint8_t iDigit = 1;
17408 while (uSrc1)
17409 {
17410 if (uSrc1 & 0x1)
17411 {
17412 puDst->au64[0] ^= (uSrc2 << iDigit);
17413 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
17414 }
17415
17416 uSrc1 >>= 1;
17417 iDigit++;
17418 }
17419}
17420
17421
/**
 * [V]PINSRW
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Replaces one of the four words of a 64-bit value.
 *
 * @param   pu64Dst     The value to modify.
 * @param   u16Src      The word to insert.
 * @param   bEvil       The instruction immediate; bits 1:0 select the word.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u64,(uint64_t *pu64Dst, uint16_t u16Src, uint8_t bEvil))
{
    uint8_t const  cBitShift = (bEvil & 0x3) * 16;
    uint64_t const fKeepMask = ~(UINT64_C(0xffff) << cBitShift);
    uint64_t const uNewWord  = (uint64_t)u16Src << cBitShift;

    *pu64Dst = (*pu64Dst & fKeepMask) | uNewWord;
}


/**
 * Replaces one of the eight words of a 128-bit value.
 *
 * @param   puDst       The value to modify.
 * @param   u16Src      The word to insert.
 * @param   bEvil       The instruction immediate; bits 2:0 select the word.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u128,(PRTUINT128U puDst, uint16_t u16Src, uint8_t bEvil))
{
    uint8_t const iWord = bEvil & 0x7;
    puDst->au16[iWord] = u16Src;
}
#endif
17438
17439
17440IEM_DECL_IMPL_DEF(void, iemAImpl_vpinsrw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint16_t u16Src, uint8_t bEvil))
17441{
17442 *puDst = *puSrc;
17443 puDst->au16[bEvil & 0x7] = u16Src;
17444}
17445
17446
/**
 * [V]PEXTRW
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Extracts one of the four words of a 64-bit value.
 *
 * @param   pu16Dst     Where to store the extracted word.
 * @param   u64Src      The source value.
 * @param   bEvil       The instruction immediate; bits 1:0 select the word.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u64,(uint16_t *pu16Dst, uint64_t u64Src, uint8_t bEvil))
{
    unsigned const cBitShift = (bEvil & 0x3) * 16;
    *pu16Dst = (uint16_t)(u64Src >> cBitShift);
}


/**
 * Extracts one of the eight words of a 128-bit value.
 *
 * @param   pu16Dst     Where to store the extracted word.
 * @param   puSrc       The source value.
 * @param   bEvil       The instruction immediate; bits 2:0 select the word.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u128,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
{
    unsigned const iWord = bEvil & 0x7;
    *pu16Dst = puSrc->au16[iWord];
}

#endif
17463
17464IEM_DECL_IMPL_DEF(void, iemAImpl_vpextrw_u128_fallback,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
17465{
17466 *pu16Dst = puSrc->au16[bEvil & 0x7];
17467}
17468
17469
/**
 * [V]MOVMSKPS
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Collects the top bit of each of the four dwords into a 4-bit mask.
 *
 * @param   pu8Dst  Where to store the mask; bit N is bit 31 of dword N.
 * @param   puSrc   The source value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
{
    uint8_t fSignMask = 0;
    for (unsigned iDword = 0; iDword < 4; iDword++)
        fSignMask |= (uint8_t)((puSrc->au32[iDword] >> 31) << iDword);
    *pu8Dst = fSignMask;
}

#endif
17483
17484IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17485{
17486 *pu8Dst = puSrc->au32[0] >> 31;
17487 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17488 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17489 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17490}
17491
17492
17493IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
17494{
17495 *pu8Dst = puSrc->au32[0] >> 31;
17496 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17497 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17498 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17499 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
17500 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
17501 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
17502 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
17503}
17504
17505
/**
 * [V]MOVMSKPD
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Collects the top bit of each of the two qwords into a 2-bit mask.
 *
 * @param   pu8Dst  Where to store the mask; bit N is bit 63 of qword N.
 * @param   puSrc   The source value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
{
    uint8_t fSignMask = 0;
    for (unsigned iQword = 0; iQword < 2; iQword++)
        fSignMask |= (uint8_t)((puSrc->au64[iQword] >> 63) << iQword);
    *pu8Dst = fSignMask;
}

#endif
17517
17518IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17519{
17520 *pu8Dst = puSrc->au64[0] >> 63;
17521 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17522}
17523
17524
17525IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
17526{
17527 *pu8Dst = puSrc->au64[0] >> 63;
17528 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17529 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
17530 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
17531}
17532
17533
/**
 * CVTTSD2SI
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Converts a double precision value to a 32-bit signed integer using the
 * SoftFloat round-towards-zero ("minMag") conversion.
 *
 * @param   pFpuState   The FPU state; only MXCSR is read.
 * @param   pfMxcsr     Where to return MXCSR with the new exception flags
 *                      ORed in.
 * @param   pi32Dst     Where to store the result.
 * @param   pu64Src     The raw bits of the source value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
{
    uint32_t const fMxcsrIn = pFpuState->MXCSR;

    /* Prepare the input in place; the returned flags are deliberately
       dropped (the de-normal flag is not set by this instruction). */
    RTFLOAT64U r64Val;
    r64Val.u = *pu64Src;
    iemSsePrepareValueR64(&r64Val, fMxcsrIn, &r64Val);

    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsrIn);
    *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Val), true /*exact*/, &SoftState);
    *pfMxcsr = fMxcsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
}


/**
 * Converts a double precision value to a 64-bit signed integer using the
 * SoftFloat round-towards-zero ("minMag") conversion.
 *
 * @param   pFpuState   The FPU state; only MXCSR is read.
 * @param   pfMxcsr     Where to return MXCSR with the new exception flags
 *                      ORed in.
 * @param   pi64Dst     Where to store the result.
 * @param   pu64Src     The raw bits of the source value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
{
    uint32_t const fMxcsrIn = pFpuState->MXCSR;

    /* Prepare the input in place; the returned flags are deliberately
       dropped (the de-normal flag is not set by this instruction). */
    RTFLOAT64U r64Val;
    r64Val.u = *pu64Src;
    iemSsePrepareValueR64(&r64Val, fMxcsrIn, &r64Val);

    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsrIn);
    *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Val), true /*exact*/, &SoftState);
    *pfMxcsr = fMxcsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
}
#endif
17563
17564
/**
 * CVTSD2SI
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Converts a double precision value to a 32-bit signed integer using the
 * rounding mode of the SoftFloat state derived from MXCSR.
 *
 * @param   pFpuState   The FPU state; only MXCSR is read.
 * @param   pfMxcsr     Where to return MXCSR with the new exception flags
 *                      ORed in.
 * @param   pi32Dst     Where to store the result.
 * @param   pu64Src     The raw bits of the source value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
{
    uint32_t const fMxcsrIn = pFpuState->MXCSR;

    /* Prepare the input in place; the returned flags are deliberately
       dropped (the de-normal flag is not set by this instruction). */
    RTFLOAT64U r64Val;
    r64Val.u = *pu64Src;
    iemSsePrepareValueR64(&r64Val, fMxcsrIn, &r64Val);

    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsrIn);
    *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Val), SoftState.roundingMode, true /*exact*/, &SoftState);
    *pfMxcsr = fMxcsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
}


/**
 * Converts a double precision value to a 64-bit signed integer using the
 * rounding mode of the SoftFloat state derived from MXCSR.
 *
 * @param   pFpuState   The FPU state; only MXCSR is read.
 * @param   pfMxcsr     Where to return MXCSR with the new exception flags
 *                      ORed in.
 * @param   pi64Dst     Where to store the result.
 * @param   pu64Src     The raw bits of the source value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
{
    uint32_t const fMxcsrIn = pFpuState->MXCSR;

    /* Prepare the input in place; the returned flags are deliberately
       dropped (the de-normal flag is not set by this instruction). */
    RTFLOAT64U r64Val;
    r64Val.u = *pu64Src;
    iemSsePrepareValueR64(&r64Val, fMxcsrIn, &r64Val);

    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsrIn);
    *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Val), SoftState.roundingMode, true /*exact*/, &SoftState);
    *pfMxcsr = fMxcsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
}
#endif
17594
17595
/**
 * CVTTSS2SI
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Converts a single precision value to a 32-bit signed integer using the
 * SoftFloat round-towards-zero ("minMag") conversion.
 *
 * @param   pFpuState   The FPU state; only MXCSR is read.
 * @param   pfMxcsr     Where to return MXCSR with the new exception flags
 *                      ORed in.
 * @param   pi32Dst     Where to store the result.
 * @param   pu32Src     The raw bits of the source value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
{
    uint32_t const fMxcsrIn = pFpuState->MXCSR;

    /* Prepare the input in place; the returned flags are deliberately
       dropped (the de-normal flag is not set by this instruction). */
    RTFLOAT32U r32Val;
    r32Val.u = *pu32Src;
    iemSsePrepareValueR32(&r32Val, fMxcsrIn, &r32Val);

    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsrIn);
    *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Val), true /*exact*/, &SoftState);
    *pfMxcsr = fMxcsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
}


/**
 * Converts a single precision value to a 64-bit signed integer using the
 * SoftFloat round-towards-zero ("minMag") conversion.
 *
 * @param   pFpuState   The FPU state; only MXCSR is read.
 * @param   pfMxcsr     Where to return MXCSR with the new exception flags
 *                      ORed in.
 * @param   pi64Dst     Where to store the result.
 * @param   pu32Src     The raw bits of the source value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
{
    uint32_t const fMxcsrIn = pFpuState->MXCSR;

    /* Prepare the input in place; the returned flags are deliberately
       dropped (the de-normal flag is not set by this instruction). */
    RTFLOAT32U r32Val;
    r32Val.u = *pu32Src;
    iemSsePrepareValueR32(&r32Val, fMxcsrIn, &r32Val);

    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsrIn);
    *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Val), true /*exact*/, &SoftState);
    *pfMxcsr = fMxcsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
}
#endif
17625
17626
/**
 * CVTSS2SI
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Converts a single precision value to a 32-bit signed integer using the
 * rounding mode of the SoftFloat state derived from MXCSR.
 *
 * @param   pFpuState   The FPU state; only MXCSR is read.
 * @param   pfMxcsr     Where to return MXCSR with the new exception flags
 *                      ORed in.
 * @param   pi32Dst     Where to store the result.
 * @param   pu32Src     The raw bits of the source value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
{
    uint32_t const fMxcsrIn = pFpuState->MXCSR;

    /* Prepare the input in place; the returned flags are deliberately
       dropped (the de-normal flag is not set by this instruction). */
    RTFLOAT32U r32Val;
    r32Val.u = *pu32Src;
    iemSsePrepareValueR32(&r32Val, fMxcsrIn, &r32Val);

    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsrIn);
    *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Val), SoftState.roundingMode, true /*exact*/, &SoftState);
    *pfMxcsr = fMxcsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
}


/**
 * Converts a single precision value to a 64-bit signed integer using the
 * rounding mode of the SoftFloat state derived from MXCSR.
 *
 * @param   pFpuState   The FPU state; only MXCSR is read.
 * @param   pfMxcsr     Where to return MXCSR with the new exception flags
 *                      ORed in.
 * @param   pi64Dst     Where to store the result.
 * @param   pu32Src     The raw bits of the source value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
{
    uint32_t const fMxcsrIn = pFpuState->MXCSR;

    /* Prepare the input in place; the returned flags are deliberately
       dropped (the de-normal flag is not set by this instruction). */
    RTFLOAT32U r32Val;
    r32Val.u = *pu32Src;
    iemSsePrepareValueR32(&r32Val, fMxcsrIn, &r32Val);

    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsrIn);
    *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Val), SoftState.roundingMode, true /*exact*/, &SoftState);
    *pfMxcsr = fMxcsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
}
#endif
17656
17657
/**
 * CVTSI2SD
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Converts a 32-bit signed integer to a double precision value.
 *
 * @param   pFpuState   The FPU state; only MXCSR is read.
 * @param   pfMxcsr     Where to return the MXCSR value produced by
 *                      iemSseSoftStateAndR64ToMxcsrAndIprtResult().
 * @param   pr64Dst     Where to store the result.
 * @param   pi32Src     The integer to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
{
    uint32_t const    fMxcsrIn  = pFpuState->MXCSR;
    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsrIn);
    float64_t const   r64Conv   = i32_to_f64(*pi32Src, &SoftState);

    *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Conv, pr64Dst, fMxcsrIn);
}


/**
 * Converts a 64-bit signed integer to a double precision value.
 *
 * @param   pFpuState   The FPU state; only MXCSR is read.
 * @param   pfMxcsr     Where to return the MXCSR value produced by
 *                      iemSseSoftStateAndR64ToMxcsrAndIprtResult().
 * @param   pr64Dst     Where to store the result.
 * @param   pi64Src     The integer to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
{
    uint32_t const    fMxcsrIn  = pFpuState->MXCSR;
    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsrIn);
    float64_t const   r64Conv   = i64_to_f64(*pi64Src, &SoftState);

    *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Conv, pr64Dst, fMxcsrIn);
}
#endif
17677
17678
/**
 * CVTSI2SS
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Converts a 32-bit signed integer to a single precision value.
 *
 * @param   pFpuState   The FPU state; only MXCSR is read.
 * @param   pfMxcsr     Where to return the MXCSR value produced by
 *                      iemSseSoftStateAndR32ToMxcsrAndIprtResult().
 * @param   pr32Dst     Where to store the result.
 * @param   pi32Src     The integer to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
{
    uint32_t const    fMxcsrIn  = pFpuState->MXCSR;
    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsrIn);
    float32_t const   r32Conv   = i32_to_f32(*pi32Src, &SoftState);

    *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Conv, pr32Dst, fMxcsrIn);
}


/**
 * Converts a 64-bit signed integer to a single precision value.
 *
 * @param   pFpuState   The FPU state; only MXCSR is read.
 * @param   pfMxcsr     Where to return the MXCSR value produced by
 *                      iemSseSoftStateAndR32ToMxcsrAndIprtResult().
 * @param   pr32Dst     Where to store the result.
 * @param   pi64Src     The integer to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
{
    uint32_t const    fMxcsrIn  = pFpuState->MXCSR;
    softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsrIn);
    float32_t const   r32Conv   = i64_to_f32(*pi64Src, &SoftState);

    *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Conv, pr32Dst, fMxcsrIn);
}
#endif
17698
17699
/**
 * [V]UCOMISS
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Unordered compare of the low single precision values of two registers,
 * reporting the outcome in ZF, PF and CF (all other status flags are cleared).
 *
 * @param   pfMxcsr     The MXCSR value; input and output.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   puSrc1      The first operand.
 * @param   puSrc2      The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    uint32_t const fEFlagsKept = *pfEFlags & ~X86_EFL_STATUS_BITS;
    uint32_t       fCmpFlags   = 0; /* GREATER_THAN 000 unless changed below. */

    bool const fHasSNaN = RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]);
    if (   fHasSNaN
        || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0])
        || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
    {
        /* Only signalling NaNs raise \#IE here, quiet NaNs do not. */
        if (fHasSNaN)
            *pfMxcsr |= X86_MXCSR_IE;
        fCmpFlags = X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF;  /* UNORDERED 111 */
    }
    else
    {
        softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);

        RTFLOAT32U r32Lhs, r32Rhs;
        uint32_t fDenormal = iemSsePrepareValueR32(&r32Lhs, *pfMxcsr, &puSrc1->ar32[0]);
        fDenormal         |= iemSsePrepareValueR32(&r32Rhs, *pfMxcsr, &puSrc2->ar32[0]);

        float32_t const f32Lhs = iemFpSoftF32FromIprt(&r32Lhs);
        float32_t const f32Rhs = iemFpSoftF32FromIprt(&r32Rhs);
        if (f32_eq(f32Lhs, f32Rhs, &SoftState))
            fCmpFlags = X86_EFL_ZF;                         /* EQUAL 100 */
        else if (f32_lt(f32Lhs, f32Rhs, &SoftState))
            fCmpFlags = X86_EFL_CF;                         /* LESS_THAN 001 */

        *pfMxcsr |= fDenormal;
    }

    *pfEFlags = fEFlagsKept | fCmpFlags;
}
#endif
17740
/**
 * VEX variant fallback: forwards all arguments unmodified to
 * iemAImpl_ucomiss_u128().
 *
 * @param   pfMxcsr     The MXCSR value; input and output.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   puSrc1      The first operand.
 * @param   puSrc2      The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
17745
17746
/**
 * [V]UCOMISD
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Unordered compare of the low double precision values of two registers,
 * reporting the outcome in ZF, PF and CF (all other status flags are cleared).
 *
 * @param   pfMxcsr     The MXCSR value; input and output.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   puSrc1      The first operand.
 * @param   puSrc2      The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    uint32_t const fEFlagsKept = *pfEFlags & ~X86_EFL_STATUS_BITS;
    uint32_t       fCmpFlags   = 0; /* GREATER_THAN 000 unless changed below. */

    bool const fHasSNaN = RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]);
    if (   fHasSNaN
        || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0])
        || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
    {
        /* Only signalling NaNs raise \#IE here, quiet NaNs do not. */
        if (fHasSNaN)
            *pfMxcsr |= X86_MXCSR_IE;
        fCmpFlags = X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF;  /* UNORDERED 111 */
    }
    else
    {
        softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);

        RTFLOAT64U r64Lhs, r64Rhs;
        uint32_t fDenormal = iemSsePrepareValueR64(&r64Lhs, *pfMxcsr, &puSrc1->ar64[0]);
        fDenormal         |= iemSsePrepareValueR64(&r64Rhs, *pfMxcsr, &puSrc2->ar64[0]);

        float64_t const f64Lhs = iemFpSoftF64FromIprt(&r64Lhs);
        float64_t const f64Rhs = iemFpSoftF64FromIprt(&r64Rhs);
        if (f64_eq(f64Lhs, f64Rhs, &SoftState))
            fCmpFlags = X86_EFL_ZF;                         /* EQUAL 100 */
        else if (f64_lt(f64Lhs, f64Rhs, &SoftState))
            fCmpFlags = X86_EFL_CF;                         /* LESS_THAN 001 */

        *pfMxcsr |= fDenormal;
    }

    *pfEFlags = fEFlagsKept | fCmpFlags;
}
#endif
17787
/**
 * VEX variant fallback: forwards all arguments unmodified to
 * iemAImpl_ucomisd_u128().
 *
 * @param   pfMxcsr     The MXCSR value; input and output.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   puSrc1      The first operand.
 * @param   puSrc2      The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
17792
17793
/**
 * [V]COMISS
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Ordered compare of the low single precision values of two registers,
 * reporting the outcome in ZF, PF and CF (all other status flags are cleared).
 * Unlike the unordered variant, any NaN operand (quiet or signalling) sets
 * the invalid-operation flag in MXCSR.
 *
 * @param   pfMxcsr     The MXCSR value; input and output.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   puSrc1      The first operand.
 * @param   puSrc2      The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    uint32_t const fEFlagsKept = *pfEFlags & ~X86_EFL_STATUS_BITS;
    uint32_t       fCmpFlags   = 0; /* GREATER_THAN 000 unless changed below. */

    bool const fHasNaN =    RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0])
                         || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
                         || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0])
                         || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]);
    if (fHasNaN)
    {
        *pfMxcsr |= X86_MXCSR_IE;
        fCmpFlags = X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF;  /* UNORDERED 111 */
    }
    else
    {
        softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);

        RTFLOAT32U r32Lhs, r32Rhs;
        uint32_t fDenormal = iemSsePrepareValueR32(&r32Lhs, *pfMxcsr, &puSrc1->ar32[0]);
        fDenormal         |= iemSsePrepareValueR32(&r32Rhs, *pfMxcsr, &puSrc2->ar32[0]);

        float32_t const f32Lhs = iemFpSoftF32FromIprt(&r32Lhs);
        float32_t const f32Rhs = iemFpSoftF32FromIprt(&r32Rhs);
        if (f32_eq(f32Lhs, f32Rhs, &SoftState))
            fCmpFlags = X86_EFL_ZF;                         /* EQUAL 100 */
        else if (f32_lt(f32Lhs, f32Rhs, &SoftState))
            fCmpFlags = X86_EFL_CF;                         /* LESS_THAN 001 */

        *pfMxcsr |= fDenormal;
    }

    *pfEFlags = fEFlagsKept | fCmpFlags;
}
#endif
17830
17831
/**
 * VEX variant fallback: forwards all arguments unmodified to
 * iemAImpl_comiss_u128().
 *
 * @param   pfMxcsr     The MXCSR value; input and output.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   puSrc1      The first operand.
 * @param   puSrc2      The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
17836
17837
/**
 * [V]COMISD
 */
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * Ordered compare of the low double precision values of two registers,
 * reporting the outcome in ZF, PF and CF (all other status flags are cleared).
 * Unlike the unordered variant, any NaN operand (quiet or signalling) sets
 * the invalid-operation flag in MXCSR.
 *
 * @param   pfMxcsr     The MXCSR value; input and output.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   puSrc1      The first operand.
 * @param   puSrc2      The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    uint32_t const fEFlagsKept = *pfEFlags & ~X86_EFL_STATUS_BITS;
    uint32_t       fCmpFlags   = 0; /* GREATER_THAN 000 unless changed below. */

    bool const fHasNaN =    RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0])
                         || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
                         || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0])
                         || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]);
    if (fHasNaN)
    {
        *pfMxcsr |= X86_MXCSR_IE;
        fCmpFlags = X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF;  /* UNORDERED 111 */
    }
    else
    {
        softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);

        RTFLOAT64U r64Lhs, r64Rhs;
        uint32_t fDenormal = iemSsePrepareValueR64(&r64Lhs, *pfMxcsr, &puSrc1->ar64[0]);
        fDenormal         |= iemSsePrepareValueR64(&r64Rhs, *pfMxcsr, &puSrc2->ar64[0]);

        float64_t const f64Lhs = iemFpSoftF64FromIprt(&r64Lhs);
        float64_t const f64Rhs = iemFpSoftF64FromIprt(&r64Rhs);
        if (f64_eq(f64Lhs, f64Rhs, &SoftState))
            fCmpFlags = X86_EFL_ZF;                         /* EQUAL 100 */
        else if (f64_lt(f64Lhs, f64Rhs, &SoftState))
            fCmpFlags = X86_EFL_CF;                         /* LESS_THAN 001 */

        *pfMxcsr |= fDenormal;
    }

    *pfEFlags = fEFlagsKept | fCmpFlags;
}
#endif
17874
/**
 * VEX variant fallback: forwards all arguments unmodified to
 * iemAImpl_comisd_u128().
 *
 * @param   pfMxcsr     The MXCSR value; input and output.
 * @param   pfEFlags    The EFLAGS value to update.
 * @param   puSrc1      The first operand.
 * @param   puSrc2      The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
17879
17880
17881/**
17882 * CMPPS / CMPPD / CMPSS / CMPSD
17883 */
17884#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * A compare truth table entry.
 *
 * Describes one comparison predicate: the boolean result for each possible
 * relation between the two operands, plus whether a quiet NaN operand sets
 * the invalid-operation flag.
 */
typedef struct CMPTRUTHTBLENTRY
{
    /** Flag whether the \#IA is signalled when one of the source operands is a QNaN. */
    bool fSignalsOnQNan;
    /** The boolean result when the input operands are unordered. */
    bool fUnordered;
    /** The boolean result when A = B. */
    bool fEqual;
    /** The boolean result when A < B. */
    bool fLowerThan;
    /** The boolean result when A > B. */
    bool fGreaterThan;
} CMPTRUTHTBLENTRY;
/** Pointer to a const truth table entry. */
typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
17903
17904
17905/** The compare truth table (indexed by immediate). */
17906static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
17907{
17908 /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
17909 /* 00H (EQ_OQ) */ { false, false, true, false, false },
17910 /* 01H (LT_OS) */ { true, false, false, true, false },
17911 /* 02H (LE_OS) */ { true, false, true, true, false },
17912 /* 03H (UNORD_Q) */ { false, true, false, false, false },
17913 /* 04H (NEQ_UQ) */ { false, true, false, true, true },
17914 /* 05H (NLT_US) */ { true, true, true, false, true },
17915 /* 06H (NLE_US) */ { true, true, false, false, true },
17916 /* 07H (ORQ_Q) */ { false, false, true, true, true },
17917 /** @todo AVX variants. */
17918};
17919
17920
17921static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
17922{
17923 bool fRes;
17924 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
17925
17926 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
17927 {
17928 *pfMxcsr |= X86_MXCSR_IE;
17929 fRes = g_aCmpTbl[bEvil].fUnordered;
17930 }
17931 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
17932 {
17933 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
17934 *pfMxcsr |= X86_MXCSR_IE;
17935 fRes = g_aCmpTbl[bEvil].fUnordered;
17936 }
17937 else
17938 {
17939 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17940
17941 RTFLOAT32U r32Src1, r32Src2;
17942 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
17943 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
17944
17945 *pfMxcsr |= fDe;
17946 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17947 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17948 if (f32_eq(f32Src1, f32Src2, &SoftState))
17949 fRes = g_aCmpTbl[bEvil].fEqual;
17950 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17951 fRes = g_aCmpTbl[bEvil].fLowerThan;
17952 else
17953 fRes = g_aCmpTbl[bEvil].fGreaterThan;
17954 }
17955
17956 return fRes;
17957}
17958
17959
17960static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
17961{
17962 bool fRes;
17963 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
17964
17965 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
17966 {
17967 *pfMxcsr |= X86_MXCSR_IE;
17968 fRes = g_aCmpTbl[bEvil].fUnordered;
17969 }
17970 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
17971 {
17972 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
17973 *pfMxcsr |= X86_MXCSR_IE;
17974 fRes = g_aCmpTbl[bEvil].fUnordered;
17975 }
17976 else
17977 {
17978 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17979
17980 RTFLOAT64U r64Src1, r64Src2;
17981 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
17982 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
17983
17984 *pfMxcsr |= fDe;
17985 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17986 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17987 if (f64_eq(f64Src1, f64Src2, &SoftState))
17988 fRes = g_aCmpTbl[bEvil].fEqual;
17989 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17990 fRes = g_aCmpTbl[bEvil].fLowerThan;
17991 else
17992 fRes = g_aCmpTbl[bEvil].fGreaterThan;
17993 }
17994
17995 return fRes;
17996}
17997
17998
17999IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18000{
18001 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18002 {
18003 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
18004 puDst->au32[i] = UINT32_MAX;
18005 else
18006 puDst->au32[i] = 0;
18007 }
18008}
18009
18010
18011IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18012{
18013 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18014 {
18015 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
18016 puDst->au64[i] = UINT64_MAX;
18017 else
18018 puDst->au64[i] = 0;
18019 }
18020}
18021
18022
18023IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18024{
18025 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
18026 puDst->au32[0] = UINT32_MAX;
18027 else
18028 puDst->au32[0] = 0;
18029
18030 puDst->au32[1] = pSrc->uSrc1.au32[1];
18031 puDst->au64[1] = pSrc->uSrc1.au64[1];
18032}
18033
18034
18035IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18036{
18037 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
18038 puDst->au64[0] = UINT64_MAX;
18039 else
18040 puDst->au64[0] = 0;
18041
18042 puDst->au64[1] = pSrc->uSrc1.au64[1];
18043}
18044#endif
18045
18046
18047/**
18048 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
18049 */
18050
18051#define X86_SSE_ROUNDXX_IMM_RC_MASK UINT8_C(0x03)
18052#define X86_SSE_ROUNDXX_IMM_ROUND_SEL UINT8_C(0x04)
18053#define X86_SSE_ROUNDXX_IMM_PRECISION UINT8_C(0x08)
18054
18055#define X86_SSE_ROUNDXX_IMM_MASK UINT8_C(0x0F)
18056
18057DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
18058{
18059 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
18060 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18061
18062 fMxcsr &= ~X86_MXCSR_RC_MASK;
18063 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
18064 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18065}
18066
18067static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
18068{
18069 RTFLOAT32U r32Src, r32Dst;
18070 float32_t f32Src;
18071 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18072 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18073
18074 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
18075 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
18076
18077 iemFpSoftF32ToIprt(&r32Dst, f32Src);
18078 return r32Dst;
18079}
18080
18081static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
18082{
18083 RTFLOAT64U r64Src, r64Dst;
18084 float64_t f64Src;
18085 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18086 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18087
18088 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
18089 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
18090
18091 iemFpSoftF64ToIprt(&r64Dst, f64Src);
18092 return r64Dst;
18093}
18094
18095#ifdef IEM_WITHOUT_ASSEMBLY
18096IEM_DECL_IMPL_DEF(void, iemAImpl_roundss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18097{
18098 puDst->ar32[0] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18099 puDst->au32[1] = pSrc->uSrc1.au32[1];
18100 puDst->au64[1] = pSrc->uSrc1.au64[1];
18101}
18102
18103
18104IEM_DECL_IMPL_DEF(void, iemAImpl_roundsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18105{
18106 puDst->ar64[0] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18107 puDst->au64[1] = pSrc->uSrc1.au64[1];
18108}
18109#endif
18110
18111IEM_DECL_IMPL_DEF(void, iemAImpl_roundps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18112{
18113 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18114 {
18115 puDst->ar32[i] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18116 }
18117}
18118
18119
18120IEM_DECL_IMPL_DEF(void, iemAImpl_roundpd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18121{
18122 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18123 {
18124 puDst->ar64[i] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18125 }
18126}
18127
18128/**
18129 * CVTPD2PI
18130 */
18131#ifdef IEM_WITHOUT_ASSEMBLY
18132static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18133{
18134 RTFLOAT64U r64Src;
18135 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18136
18137 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18138 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18139 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18140}
18141
18142
18143IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18144{
18145 RTUINT64U u64Res;
18146 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
18147 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
18148
18149 *pu64Dst = u64Res.u;
18150 *pfMxcsr = fMxcsrOut;
18151}
18152#endif
18153
18154
18155/**
18156 * CVTTPD2PI
18157 */
18158#ifdef IEM_WITHOUT_ASSEMBLY
18159static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18160{
18161 RTFLOAT64U r64Src;
18162 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18163
18164 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18165 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18166 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18167}
18168
18169
18170IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18171{
18172 RTUINT64U u64Res;
18173 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
18174 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
18175
18176 *pu64Dst = u64Res.u;
18177 *pfMxcsr = fMxcsrOut;
18178}
18179#endif
18180
18181
18182/**
18183 * CVTPI2PS
18184 */
18185#ifdef IEM_WITHOUT_ASSEMBLY
18186static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
18187{
18188 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18189 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
18190 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
18191}
18192
18193
18194IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
18195{
18196 RTUINT64U uSrc = { u64Src };
18197 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
18198 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
18199 *pfMxcsr = fMxcsrOut;
18200}
18201#endif
18202
18203
18204/**
18205 * CVTPI2PD
18206 */
18207#ifdef IEM_WITHOUT_ASSEMBLY
18208static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
18209{
18210 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18211 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
18212 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
18213}
18214
18215
18216IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
18217{
18218 RTUINT64U uSrc = { u64Src };
18219 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
18220 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
18221 *pfMxcsr = fMxcsrOut;
18222}
18223#endif
18224
18225
18226/**
18227 * CVTPS2PI
18228 */
18229#ifdef IEM_WITHOUT_ASSEMBLY
18230static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
18231{
18232 RTFLOAT32U r32Src;
18233 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
18234
18235 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18236 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18237 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18238}
18239
18240
18241IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
18242{
18243 RTUINT64U uDst;
18244 RTUINT64U uSrc = { u64Src };
18245 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
18246 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
18247 *pu64Dst = uDst.u;
18248 *pfMxcsr = fMxcsrOut;
18249}
18250#endif
18251
18252
18253/**
18254 * CVTTPS2PI
18255 */
18256#ifdef IEM_WITHOUT_ASSEMBLY
18257static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
18258{
18259 RTFLOAT32U r32Src;
18260 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
18261
18262 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18263 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18264 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18265}
18266
18267
18268IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
18269{
18270 RTUINT64U uDst;
18271 RTUINT64U uSrc = { u64Src };
18272 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
18273 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
18274 *pu64Dst = uDst.u;
18275 *pfMxcsr = fMxcsrOut;
18276}
18277#endif
18278
18279/**
18280 * RDRAND
18281 */
18282IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18283{
18284 *puDst = 0;
18285 *pEFlags &= ~X86_EFL_STATUS_BITS;
18286 *pEFlags |= X86_EFL_CF;
18287}
18288
18289IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18290{
18291 *puDst = 0;
18292 *pEFlags &= ~X86_EFL_STATUS_BITS;
18293 *pEFlags |= X86_EFL_CF;
18294}
18295
18296IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18297{
18298 *puDst = 0;
18299 *pEFlags &= ~X86_EFL_STATUS_BITS;
18300 *pEFlags |= X86_EFL_CF;
18301}
18302
18303/**
18304 * RDSEED
18305 */
18306IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18307{
18308 *puDst = 0;
18309 *pEFlags &= ~X86_EFL_STATUS_BITS;
18310 *pEFlags |= X86_EFL_CF;
18311}
18312
18313IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18314{
18315 *puDst = 0;
18316 *pEFlags &= ~X86_EFL_STATUS_BITS;
18317 *pEFlags |= X86_EFL_CF;
18318}
18319
18320IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18321{
18322 *puDst = 0;
18323 *pEFlags &= ~X86_EFL_STATUS_BITS;
18324 *pEFlags |= X86_EFL_CF;
18325}
18326
18327
18328/**
18329 * SHA1NEXTE
18330 */
18331IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18332{
18333 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
18334
18335 puDst->au32[0] = puSrc->au32[0];
18336 puDst->au32[1] = puSrc->au32[1];
18337 puDst->au32[2] = puSrc->au32[2];
18338 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
18339}
18340
18341/**
18342 * SHA1MSG1
18343 */
18344IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18345{
18346 uint32_t u32W0 = puDst->au32[3];
18347 uint32_t u32W1 = puDst->au32[2];
18348 uint32_t u32W2 = puDst->au32[1];
18349 uint32_t u32W3 = puDst->au32[0];
18350 uint32_t u32W4 = puSrc->au32[3];
18351 uint32_t u32W5 = puSrc->au32[2];
18352
18353 puDst->au32[3] = u32W2 ^ u32W0;
18354 puDst->au32[2] = u32W3 ^ u32W1;
18355 puDst->au32[1] = u32W4 ^ u32W2;
18356 puDst->au32[0] = u32W5 ^ u32W3;
18357}
18358
18359/**
18360 * SHA1MSG2
18361 */
18362IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18363{
18364 uint32_t u32W13 = puSrc->au32[2];
18365 uint32_t u32W14 = puSrc->au32[1];
18366 uint32_t u32W15 = puSrc->au32[0];
18367 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
18368 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
18369 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
18370 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
18371
18372 puDst->au32[3] = u32W16;
18373 puDst->au32[2] = u32W17;
18374 puDst->au32[1] = u32W18;
18375 puDst->au32[0] = u32W19;
18376}
18377
18378/**
18379 * SHA1RNDS4
18380 */
18381typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
18382typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;
18383
18384static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18385{
18386 return (u32B & u32C) ^ (~u32B & u32D);
18387}
18388
18389static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18390{
18391 return u32B ^ u32C ^ u32D;
18392}
18393
18394static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18395{
18396 return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
18397}
18398
18399static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18400{
18401 return u32B ^ u32C ^ u32D;
18402}
18403
18404IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18405{
18406 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
18407 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
18408
18409 uint32_t au32A[5];
18410 uint32_t au32B[5];
18411 uint32_t au32C[5];
18412 uint32_t au32D[5];
18413 uint32_t au32E[5];
18414 uint32_t au32W[4];
18415 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
18416 uint32_t u32K = s_au32K[bEvil & 0x3];
18417
18418 au32A[0] = puDst->au32[3];
18419 au32B[0] = puDst->au32[2];
18420 au32C[0] = puDst->au32[1];
18421 au32D[0] = puDst->au32[0];
18422 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
18423 au32W[i] = puSrc->au32[3 - i];
18424
18425 /* Round 0 is a bit different than the other rounds. */
18426 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
18427 au32B[1] = au32A[0];
18428 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
18429 au32D[1] = au32C[0];
18430 au32E[1] = au32D[0];
18431
18432 for (uint32_t i = 1; i <= 3; i++)
18433 {
18434 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
18435 au32B[i + 1] = au32A[i];
18436 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
18437 au32D[i + 1] = au32C[i];
18438 au32E[i + 1] = au32D[i];
18439 }
18440
18441 puDst->au32[3] = au32A[4];
18442 puDst->au32[2] = au32B[4];
18443 puDst->au32[1] = au32C[4];
18444 puDst->au32[0] = au32D[4];
18445}
18446
18447
18448/**
18449 * SHA256MSG1
18450 */
18451DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
18452{
18453 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
18454}
18455
18456IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18457{
18458 uint32_t u32W4 = puSrc->au32[0];
18459 uint32_t u32W3 = puDst->au32[3];
18460 uint32_t u32W2 = puDst->au32[2];
18461 uint32_t u32W1 = puDst->au32[1];
18462 uint32_t u32W0 = puDst->au32[0];
18463
18464 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
18465 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
18466 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
18467 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
18468}
18469
18470/**
18471 * SHA256MSG2
18472 */
18473DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
18474{
18475 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
18476}
18477
18478IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18479{
18480 uint32_t u32W14 = puSrc->au32[2];
18481 uint32_t u32W15 = puSrc->au32[3];
18482 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
18483 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
18484 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
18485 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
18486
18487 puDst->au32[3] = u32W19;
18488 puDst->au32[2] = u32W18;
18489 puDst->au32[1] = u32W17;
18490 puDst->au32[0] = u32W16;
18491}
18492
18493/**
18494 * SHA256RNDS2
18495 */
18496DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
18497{
18498 return (u32X & u32Y) ^ (~u32X & u32Z);
18499}
18500
18501DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
18502{
18503 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
18504}
18505
18506DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
18507{
18508 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
18509}
18510
18511DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
18512{
18513 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
18514}
18515
18516IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
18517{
18518 uint32_t au32A[3];
18519 uint32_t au32B[3];
18520 uint32_t au32C[3];
18521 uint32_t au32D[3];
18522 uint32_t au32E[3];
18523 uint32_t au32F[3];
18524 uint32_t au32G[3];
18525 uint32_t au32H[3];
18526 uint32_t au32WK[2];
18527
18528 au32A[0] = puSrc->au32[3];
18529 au32B[0] = puSrc->au32[2];
18530 au32C[0] = puDst->au32[3];
18531 au32D[0] = puDst->au32[2];
18532 au32E[0] = puSrc->au32[1];
18533 au32F[0] = puSrc->au32[0];
18534 au32G[0] = puDst->au32[1];
18535 au32H[0] = puDst->au32[0];
18536
18537 au32WK[0] = puXmm0Constants->au32[0];
18538 au32WK[1] = puXmm0Constants->au32[1];
18539
18540 for (uint32_t i = 0; i < 2; i++)
18541 {
18542 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
18543 + iemAImpl_sha256_upper_sigma1(au32E[i])
18544 + au32WK[i]
18545 + au32H[i]
18546 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
18547 + iemAImpl_sha256_upper_sigma0(au32A[i]);
18548 au32B[i + 1] = au32A[i];
18549 au32C[i + 1] = au32B[i];
18550 au32D[i + 1] = au32C[i];
18551 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
18552 + iemAImpl_sha256_upper_sigma1(au32E[i])
18553 + au32WK[i]
18554 + au32H[i]
18555 + au32D[i];
18556 au32F[i + 1] = au32E[i];
18557 au32G[i + 1] = au32F[i];
18558 au32H[i + 1] = au32G[i];
18559 }
18560
18561 puDst->au32[3] = au32A[2];
18562 puDst->au32[2] = au32B[2];
18563 puDst->au32[1] = au32E[2];
18564 puDst->au32[0] = au32F[2];
18565}
18566
18567
18568/**
18569 * ADCX
18570 */
/**
 * Emits the body of an ADCX/ADOX style add-with-carry.
 *
 * Expects the enclosing function to provide @c puDst, @c uSrc and
 * @c pfEFlags.  Computes *puDst + uSrc + carry-in, where the carry-in is the
 * current state of @a a_Flag in *pfEFlags, stores the sum in *puDst and sets
 * or clears @a a_Flag according to the unsigned carry-out.  No other EFLAGS
 * bit is touched.
 *
 * The carry-out is set when the plain sum wraps, or when the plain sum equals
 * @a a_Max and the carry-in pushes it over.
 *
 * @param   a_Flag  The EFLAGS bit used as carry-in and carry-out.
 * @param   a_Type  The unsigned operand type.
 * @param   a_Max   The maximum value of @a a_Type.
 */
#define ADX_EMIT(a_Flag, a_Type, a_Max) \
    do \
    { \
        bool const   fCarryIn = (*pfEFlags & (a_Flag)) != 0; \
        a_Type const uSum     = *puDst + uSrc; \
        if (   uSum < uSrc \
            || (fCarryIn && uSum == (a_Max))) \
            *pfEFlags |= (a_Flag); \
        else \
            *pfEFlags &= ~(a_Flag); \
        *puDst = (a_Type)(uSum + (fCarryIn ? 1 : 0)); \
    } \
    while (0)
18588
18589IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
18590{
18591 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
18592}
18593
18594IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
18595{
18596 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
18597}
18598
18599# if defined(IEM_WITHOUT_ASSEMBLY)
18600
18601IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
18602{
18603 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
18604}
18605
18606IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
18607{
18608 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
18609}
18610
18611#endif
18612
18613
18614/**
18615 * ADOX
18616 */
18617IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
18618{
18619 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
18620}
18621
18622IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
18623{
18624 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
18625}
18626
18627# if defined(IEM_WITHOUT_ASSEMBLY)
18628
18629IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
18630{
18631 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
18632}
18633
18634IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
18635{
18636 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
18637}
18638
18639# endif
18640
18641
18642/**
18643 * MPSADBW
18644 */
18645IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18646{
18647 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
18648 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
18649 int16_t ai16Src1[11];
18650 int16_t ai16Src2[4];
18651
18652 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
18653 ai16Src1[i] = puDst->au8[idxSrc1 + i];
18654
18655 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
18656 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
18657
18658 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18659 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
18660 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
18661 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
18662 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
18663}
18664
18665
18666IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
18667{
18668 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
18669 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
18670 int16_t ai16Src1[11];
18671 int16_t ai16Src2[4];
18672
18673 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
18674 ai16Src1[i] = puSrc1->au8[idxSrc1 + i];
18675
18676 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
18677 ai16Src2[i] = puSrc2->au8[idxSrc2 + i];
18678
18679 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18680 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
18681 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
18682 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
18683 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
18684}
18685
18686
18687IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
18688{
18689 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
18690 RTUINT256U const uSrc2 = *puSrc2;
18691 ASMCompilerBarrier();
18692 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
18693 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil >> 3);
18694}
18695
18696
18697/**
18698 * VPERM2I128
18699 */
18700IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2i128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
18701{
18702 if (bImm & RT_BIT(3))
18703 {
18704 puDst->au64[0] = 0;
18705 puDst->au64[1] = 0;
18706 }
18707 else
18708 {
18709 switch (bImm & 0x3)
18710 {
18711 case 0:
18712 puDst->au64[0] = puSrc1->au64[0];
18713 puDst->au64[1] = puSrc1->au64[1];
18714 break;
18715 case 1:
18716 puDst->au64[0] = puSrc1->au64[2];
18717 puDst->au64[1] = puSrc1->au64[3];
18718 break;
18719 case 2:
18720 puDst->au64[0] = puSrc2->au64[0];
18721 puDst->au64[1] = puSrc2->au64[1];
18722 break;
18723 case 3:
18724 puDst->au64[0] = puSrc2->au64[2];
18725 puDst->au64[1] = puSrc2->au64[3];
18726 break;
18727 }
18728 }
18729
18730 if (bImm & RT_BIT(7))
18731 {
18732 puDst->au64[2] = 0;
18733 puDst->au64[3] = 0;
18734 }
18735 else
18736 {
18737 switch ((bImm >> 4) & 0x3)
18738 {
18739 case 0:
18740 puDst->au64[2] = puSrc1->au64[0];
18741 puDst->au64[3] = puSrc1->au64[1];
18742 break;
18743 case 1:
18744 puDst->au64[2] = puSrc1->au64[2];
18745 puDst->au64[3] = puSrc1->au64[3];
18746 break;
18747 case 2:
18748 puDst->au64[2] = puSrc2->au64[0];
18749 puDst->au64[3] = puSrc2->au64[1];
18750 break;
18751 case 3:
18752 puDst->au64[2] = puSrc2->au64[2];
18753 puDst->au64[3] = puSrc2->au64[3];
18754 break;
18755 }
18756 }
18757}
18758
18759
18760/**
18761 * VPERM2F128
18762 */
18763IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2f128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
18764{
18765 iemAImpl_vperm2i128_u256_fallback(puDst, puSrc1, puSrc2, bImm);
18766}
18767
18768
18769/**
18770 * DPPS
18771 */
18772IEM_DECL_IMPL_DEF(void, iemAImpl_dpps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18773{
18774 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
18775 AssertReleaseFailed();
18776}
18777
18778
18779/**
18780 * DPPD
18781 */
18782IEM_DECL_IMPL_DEF(void, iemAImpl_dppd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18783{
18784 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
18785 AssertReleaseFailed();
18786}
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette