VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 104298

Last change on this file since 104298 was 104296, checked in by vboxsync, 12 months ago

VMM/IEM: ARM assembly rendition of RCL. bugref:10376

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 729.2 KB
Line 
1/* $Id: IEMAllAImplC.cpp 104296 2024-04-11 13:03:03Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Selects the portable C implementations in this file.
 *
 * Unless already set by the build, it is defined automatically for ARM32 and
 * ARM64 targets and when doxygen is running.  IEM_WITH_ASSEMBLY overrides it
 * (the define is removed again), which is used for tstIEMAImplAsm purposes.
 */
#if    !defined(IEM_WITHOUT_ASSEMBLY) \
    && (defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING))
# define IEM_WITHOUT_ASSEMBLY
#endif
#if defined(IEM_WITH_ASSEMBLY)
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Calculates the sign flag value given a result and its bit width.
 *
 * The sign flag (SF) mirrors the most significant bit of the result, so the
 * result is shifted right until its top bit lines up with the SF position and
 * everything else is masked off.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( X86_EFL_SF & (uint32_t)((a_uResult) >> ((a_cBitsWidth) - (X86_EFL_SF_BIT + 1))) )
73
/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) is set exactly when the result is zero: the logical
 * negation of the result (0 or 1) is moved to the ZF bit position.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)!(a_uResult) << X86_EFL_ZF_BIT )
84
/**
 * Calculates the parity flag.
 *
 * Only the low 8 bits of the result are considered.  Two variants exist: a
 * lookup in g_afParity (currently always selected because of the '|| 1') and
 * an inline bit-folding helper intended for ARM64.
 *
 * @returns X86_EFL_PF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#if !defined(RT_ARCH_ARM64) || 1 /** @todo profile this... micro benching in tstIEMAImpl indicates no gain, but it may be skewed. */
# define IEM_EFL_CALC_PARITY(a_uResult)     (g_afParity[(a_uResult) & 0xff])
#else
# define IEM_EFL_CALC_PARITY(a_uResult)     iemAImplCalcParity(a_uResult)
DECL_FORCE_INLINE(uint32_t) iemAImplCalcParity(uint32_t uResult)
{
    /* Fold the low byte down to a single bit by XORing halves (8-bit pop
       count emulation).  This translates to 4 EOR instructions on ARM64 as
       they can shift the 2nd source operand. */
    uint8_t bFold = uResult ^ (uResult >> 4);
    bFold ^= bFold >> 2;
    bFold ^= bFold >> 1;
    /* Bit 0 is now set for an odd number of ones; PF wants the inverse. */
    return (uint32_t)(~bFold & 1) << X86_EFL_PF_BIT;
}
#endif
106
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * The most significant bit of the a_uValue operand (for the given width) is
 * moved to the OF bit position and everything else is masked away.  These are
 * typically used by concatenating the name with a bit count.  The 8-bit
 * variant is the odd one out, as its top bit sits below the OF bit and must be
 * shifted left rather than right.
 */
#define X86_EFL_GET_OF_8(a_uValue)  ( X86_EFL_OF & ((uint32_t)(a_uValue) << (X86_EFL_OF_BIT + 1 - 8)) )
#define X86_EFL_GET_OF_16(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (16 - (X86_EFL_OF_BIT + 1))) )
#define X86_EFL_GET_OF_32(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (32 - (X86_EFL_OF_BIT + 1))) )
#define X86_EFL_GET_OF_64(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (64 - (X86_EFL_OF_BIT + 1))) )
117
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after arithmetic op.
 *
 * This is a statement macro: a_fEFlagsVar is modified in place, nothing is
 * returned.  All status bits are cleared first and then recalculated; other
 * bits in the variable are left untouched.
 *
 * Overflow during ADDition happens when both inputs have the same sign bit
 * value and the result has a different sign bit value.
 *
 * Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it follows
 * that for SUBtraction the sign bit value must differ between the two inputs
 * and the result's sign bit differ from the first input.  Subtraction callers
 * therefore pass the source with its sign bit flipped as a_uSrcOf.
 * Note! Must xor with sign bit to convert, not do (0 - a_uSrc).
 *
 * See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt
 *
 * @param   a_fEFlagsVar    The 32-bit EFLAGS variable to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF+OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be
 *                          a literal as it is pasted into identifiers.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_fEFlagsVar, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        a_fEFlagsVar = ((a_fEFlagsVar) & ~X86_EFL_STATUS_BITS) \
                     | ((a_CfExpr) << X86_EFL_CF_BIT) \
                     | IEM_EFL_CALC_PARITY(a_uResult) \
                     | (((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF) \
                     | X86_EFL_CALC_ZF(a_uResult) \
                     | X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
                     | X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                          & RT_BIT_64(a_cBitsWidth - 1)) \
                                                       & ((a_uResult) ^ (a_uDst)) ); \
    } while (0)
152
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * This is a statement macro: a_fEFlagsVar is modified in place, nothing is
 * returned.  All status bits are cleared, then PF, ZF and SF are calculated
 * from the result and a_fExtra is ORed in.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand
 * is undefined.  We clear AF, as that seems to make the most sense and also
 * seems to be the correct behavior on current CPUs.
 *
 * @param   a_fEFlagsVar    The 32-bit EFLAGS variable to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(a_fEFlagsVar, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        a_fEFlagsVar = ((a_fEFlagsVar) & ~X86_EFL_STATUS_BITS) \
                     | IEM_EFL_CALC_PARITY(a_uResult) \
                     | X86_EFL_CALC_ZF(a_uResult) \
                     | X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
                     | (a_fExtra); \
    } while (0)
174
175
176/*********************************************************************************************************************************
177* Global Variables *
178*********************************************************************************************************************************/
179/**
180 * Parity calculation table.
181 *
182 * This is also used by iemAllAImpl.asm.
183 *
184 * The generator code:
185 * @code
186 * #include <stdio.h>
187 *
188 * int main()
189 * {
190 * unsigned b;
191 * for (b = 0; b < 256; b++)
192 * {
193 * int cOnes = ( b & 1)
194 * + ((b >> 1) & 1)
195 * + ((b >> 2) & 1)
196 * + ((b >> 3) & 1)
197 * + ((b >> 4) & 1)
198 * + ((b >> 5) & 1)
199 * + ((b >> 6) & 1)
200 * + ((b >> 7) & 1);
201 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
202 * b,
203 * (b >> 7) & 1,
204 * (b >> 6) & 1,
205 * (b >> 5) & 1,
206 * (b >> 4) & 1,
207 * (b >> 3) & 1,
208 * (b >> 2) & 1,
209 * (b >> 1) & 1,
210 * b & 1,
211 * cOnes & 1 ? "0" : "X86_EFL_PF");
212 * }
213 * return 0;
214 * }
215 * @endcode
216 */
217uint8_t const g_afParity[256] =
218{
219 /* 0000 = 00000000b */ X86_EFL_PF,
220 /* 0x01 = 00000001b */ 0,
221 /* 0x02 = 00000010b */ 0,
222 /* 0x03 = 00000011b */ X86_EFL_PF,
223 /* 0x04 = 00000100b */ 0,
224 /* 0x05 = 00000101b */ X86_EFL_PF,
225 /* 0x06 = 00000110b */ X86_EFL_PF,
226 /* 0x07 = 00000111b */ 0,
227 /* 0x08 = 00001000b */ 0,
228 /* 0x09 = 00001001b */ X86_EFL_PF,
229 /* 0x0a = 00001010b */ X86_EFL_PF,
230 /* 0x0b = 00001011b */ 0,
231 /* 0x0c = 00001100b */ X86_EFL_PF,
232 /* 0x0d = 00001101b */ 0,
233 /* 0x0e = 00001110b */ 0,
234 /* 0x0f = 00001111b */ X86_EFL_PF,
235 /* 0x10 = 00010000b */ 0,
236 /* 0x11 = 00010001b */ X86_EFL_PF,
237 /* 0x12 = 00010010b */ X86_EFL_PF,
238 /* 0x13 = 00010011b */ 0,
239 /* 0x14 = 00010100b */ X86_EFL_PF,
240 /* 0x15 = 00010101b */ 0,
241 /* 0x16 = 00010110b */ 0,
242 /* 0x17 = 00010111b */ X86_EFL_PF,
243 /* 0x18 = 00011000b */ X86_EFL_PF,
244 /* 0x19 = 00011001b */ 0,
245 /* 0x1a = 00011010b */ 0,
246 /* 0x1b = 00011011b */ X86_EFL_PF,
247 /* 0x1c = 00011100b */ 0,
248 /* 0x1d = 00011101b */ X86_EFL_PF,
249 /* 0x1e = 00011110b */ X86_EFL_PF,
250 /* 0x1f = 00011111b */ 0,
251 /* 0x20 = 00100000b */ 0,
252 /* 0x21 = 00100001b */ X86_EFL_PF,
253 /* 0x22 = 00100010b */ X86_EFL_PF,
254 /* 0x23 = 00100011b */ 0,
255 /* 0x24 = 00100100b */ X86_EFL_PF,
256 /* 0x25 = 00100101b */ 0,
257 /* 0x26 = 00100110b */ 0,
258 /* 0x27 = 00100111b */ X86_EFL_PF,
259 /* 0x28 = 00101000b */ X86_EFL_PF,
260 /* 0x29 = 00101001b */ 0,
261 /* 0x2a = 00101010b */ 0,
262 /* 0x2b = 00101011b */ X86_EFL_PF,
263 /* 0x2c = 00101100b */ 0,
264 /* 0x2d = 00101101b */ X86_EFL_PF,
265 /* 0x2e = 00101110b */ X86_EFL_PF,
266 /* 0x2f = 00101111b */ 0,
267 /* 0x30 = 00110000b */ X86_EFL_PF,
268 /* 0x31 = 00110001b */ 0,
269 /* 0x32 = 00110010b */ 0,
270 /* 0x33 = 00110011b */ X86_EFL_PF,
271 /* 0x34 = 00110100b */ 0,
272 /* 0x35 = 00110101b */ X86_EFL_PF,
273 /* 0x36 = 00110110b */ X86_EFL_PF,
274 /* 0x37 = 00110111b */ 0,
275 /* 0x38 = 00111000b */ 0,
276 /* 0x39 = 00111001b */ X86_EFL_PF,
277 /* 0x3a = 00111010b */ X86_EFL_PF,
278 /* 0x3b = 00111011b */ 0,
279 /* 0x3c = 00111100b */ X86_EFL_PF,
280 /* 0x3d = 00111101b */ 0,
281 /* 0x3e = 00111110b */ 0,
282 /* 0x3f = 00111111b */ X86_EFL_PF,
283 /* 0x40 = 01000000b */ 0,
284 /* 0x41 = 01000001b */ X86_EFL_PF,
285 /* 0x42 = 01000010b */ X86_EFL_PF,
286 /* 0x43 = 01000011b */ 0,
287 /* 0x44 = 01000100b */ X86_EFL_PF,
288 /* 0x45 = 01000101b */ 0,
289 /* 0x46 = 01000110b */ 0,
290 /* 0x47 = 01000111b */ X86_EFL_PF,
291 /* 0x48 = 01001000b */ X86_EFL_PF,
292 /* 0x49 = 01001001b */ 0,
293 /* 0x4a = 01001010b */ 0,
294 /* 0x4b = 01001011b */ X86_EFL_PF,
295 /* 0x4c = 01001100b */ 0,
296 /* 0x4d = 01001101b */ X86_EFL_PF,
297 /* 0x4e = 01001110b */ X86_EFL_PF,
298 /* 0x4f = 01001111b */ 0,
299 /* 0x50 = 01010000b */ X86_EFL_PF,
300 /* 0x51 = 01010001b */ 0,
301 /* 0x52 = 01010010b */ 0,
302 /* 0x53 = 01010011b */ X86_EFL_PF,
303 /* 0x54 = 01010100b */ 0,
304 /* 0x55 = 01010101b */ X86_EFL_PF,
305 /* 0x56 = 01010110b */ X86_EFL_PF,
306 /* 0x57 = 01010111b */ 0,
307 /* 0x58 = 01011000b */ 0,
308 /* 0x59 = 01011001b */ X86_EFL_PF,
309 /* 0x5a = 01011010b */ X86_EFL_PF,
310 /* 0x5b = 01011011b */ 0,
311 /* 0x5c = 01011100b */ X86_EFL_PF,
312 /* 0x5d = 01011101b */ 0,
313 /* 0x5e = 01011110b */ 0,
314 /* 0x5f = 01011111b */ X86_EFL_PF,
315 /* 0x60 = 01100000b */ X86_EFL_PF,
316 /* 0x61 = 01100001b */ 0,
317 /* 0x62 = 01100010b */ 0,
318 /* 0x63 = 01100011b */ X86_EFL_PF,
319 /* 0x64 = 01100100b */ 0,
320 /* 0x65 = 01100101b */ X86_EFL_PF,
321 /* 0x66 = 01100110b */ X86_EFL_PF,
322 /* 0x67 = 01100111b */ 0,
323 /* 0x68 = 01101000b */ 0,
324 /* 0x69 = 01101001b */ X86_EFL_PF,
325 /* 0x6a = 01101010b */ X86_EFL_PF,
326 /* 0x6b = 01101011b */ 0,
327 /* 0x6c = 01101100b */ X86_EFL_PF,
328 /* 0x6d = 01101101b */ 0,
329 /* 0x6e = 01101110b */ 0,
330 /* 0x6f = 01101111b */ X86_EFL_PF,
331 /* 0x70 = 01110000b */ 0,
332 /* 0x71 = 01110001b */ X86_EFL_PF,
333 /* 0x72 = 01110010b */ X86_EFL_PF,
334 /* 0x73 = 01110011b */ 0,
335 /* 0x74 = 01110100b */ X86_EFL_PF,
336 /* 0x75 = 01110101b */ 0,
337 /* 0x76 = 01110110b */ 0,
338 /* 0x77 = 01110111b */ X86_EFL_PF,
339 /* 0x78 = 01111000b */ X86_EFL_PF,
340 /* 0x79 = 01111001b */ 0,
341 /* 0x7a = 01111010b */ 0,
342 /* 0x7b = 01111011b */ X86_EFL_PF,
343 /* 0x7c = 01111100b */ 0,
344 /* 0x7d = 01111101b */ X86_EFL_PF,
345 /* 0x7e = 01111110b */ X86_EFL_PF,
346 /* 0x7f = 01111111b */ 0,
347 /* 0x80 = 10000000b */ 0,
348 /* 0x81 = 10000001b */ X86_EFL_PF,
349 /* 0x82 = 10000010b */ X86_EFL_PF,
350 /* 0x83 = 10000011b */ 0,
351 /* 0x84 = 10000100b */ X86_EFL_PF,
352 /* 0x85 = 10000101b */ 0,
353 /* 0x86 = 10000110b */ 0,
354 /* 0x87 = 10000111b */ X86_EFL_PF,
355 /* 0x88 = 10001000b */ X86_EFL_PF,
356 /* 0x89 = 10001001b */ 0,
357 /* 0x8a = 10001010b */ 0,
358 /* 0x8b = 10001011b */ X86_EFL_PF,
359 /* 0x8c = 10001100b */ 0,
360 /* 0x8d = 10001101b */ X86_EFL_PF,
361 /* 0x8e = 10001110b */ X86_EFL_PF,
362 /* 0x8f = 10001111b */ 0,
363 /* 0x90 = 10010000b */ X86_EFL_PF,
364 /* 0x91 = 10010001b */ 0,
365 /* 0x92 = 10010010b */ 0,
366 /* 0x93 = 10010011b */ X86_EFL_PF,
367 /* 0x94 = 10010100b */ 0,
368 /* 0x95 = 10010101b */ X86_EFL_PF,
369 /* 0x96 = 10010110b */ X86_EFL_PF,
370 /* 0x97 = 10010111b */ 0,
371 /* 0x98 = 10011000b */ 0,
372 /* 0x99 = 10011001b */ X86_EFL_PF,
373 /* 0x9a = 10011010b */ X86_EFL_PF,
374 /* 0x9b = 10011011b */ 0,
375 /* 0x9c = 10011100b */ X86_EFL_PF,
376 /* 0x9d = 10011101b */ 0,
377 /* 0x9e = 10011110b */ 0,
378 /* 0x9f = 10011111b */ X86_EFL_PF,
379 /* 0xa0 = 10100000b */ X86_EFL_PF,
380 /* 0xa1 = 10100001b */ 0,
381 /* 0xa2 = 10100010b */ 0,
382 /* 0xa3 = 10100011b */ X86_EFL_PF,
383 /* 0xa4 = 10100100b */ 0,
384 /* 0xa5 = 10100101b */ X86_EFL_PF,
385 /* 0xa6 = 10100110b */ X86_EFL_PF,
386 /* 0xa7 = 10100111b */ 0,
387 /* 0xa8 = 10101000b */ 0,
388 /* 0xa9 = 10101001b */ X86_EFL_PF,
389 /* 0xaa = 10101010b */ X86_EFL_PF,
390 /* 0xab = 10101011b */ 0,
391 /* 0xac = 10101100b */ X86_EFL_PF,
392 /* 0xad = 10101101b */ 0,
393 /* 0xae = 10101110b */ 0,
394 /* 0xaf = 10101111b */ X86_EFL_PF,
395 /* 0xb0 = 10110000b */ 0,
396 /* 0xb1 = 10110001b */ X86_EFL_PF,
397 /* 0xb2 = 10110010b */ X86_EFL_PF,
398 /* 0xb3 = 10110011b */ 0,
399 /* 0xb4 = 10110100b */ X86_EFL_PF,
400 /* 0xb5 = 10110101b */ 0,
401 /* 0xb6 = 10110110b */ 0,
402 /* 0xb7 = 10110111b */ X86_EFL_PF,
403 /* 0xb8 = 10111000b */ X86_EFL_PF,
404 /* 0xb9 = 10111001b */ 0,
405 /* 0xba = 10111010b */ 0,
406 /* 0xbb = 10111011b */ X86_EFL_PF,
407 /* 0xbc = 10111100b */ 0,
408 /* 0xbd = 10111101b */ X86_EFL_PF,
409 /* 0xbe = 10111110b */ X86_EFL_PF,
410 /* 0xbf = 10111111b */ 0,
411 /* 0xc0 = 11000000b */ X86_EFL_PF,
412 /* 0xc1 = 11000001b */ 0,
413 /* 0xc2 = 11000010b */ 0,
414 /* 0xc3 = 11000011b */ X86_EFL_PF,
415 /* 0xc4 = 11000100b */ 0,
416 /* 0xc5 = 11000101b */ X86_EFL_PF,
417 /* 0xc6 = 11000110b */ X86_EFL_PF,
418 /* 0xc7 = 11000111b */ 0,
419 /* 0xc8 = 11001000b */ 0,
420 /* 0xc9 = 11001001b */ X86_EFL_PF,
421 /* 0xca = 11001010b */ X86_EFL_PF,
422 /* 0xcb = 11001011b */ 0,
423 /* 0xcc = 11001100b */ X86_EFL_PF,
424 /* 0xcd = 11001101b */ 0,
425 /* 0xce = 11001110b */ 0,
426 /* 0xcf = 11001111b */ X86_EFL_PF,
427 /* 0xd0 = 11010000b */ 0,
428 /* 0xd1 = 11010001b */ X86_EFL_PF,
429 /* 0xd2 = 11010010b */ X86_EFL_PF,
430 /* 0xd3 = 11010011b */ 0,
431 /* 0xd4 = 11010100b */ X86_EFL_PF,
432 /* 0xd5 = 11010101b */ 0,
433 /* 0xd6 = 11010110b */ 0,
434 /* 0xd7 = 11010111b */ X86_EFL_PF,
435 /* 0xd8 = 11011000b */ X86_EFL_PF,
436 /* 0xd9 = 11011001b */ 0,
437 /* 0xda = 11011010b */ 0,
438 /* 0xdb = 11011011b */ X86_EFL_PF,
439 /* 0xdc = 11011100b */ 0,
440 /* 0xdd = 11011101b */ X86_EFL_PF,
441 /* 0xde = 11011110b */ X86_EFL_PF,
442 /* 0xdf = 11011111b */ 0,
443 /* 0xe0 = 11100000b */ 0,
444 /* 0xe1 = 11100001b */ X86_EFL_PF,
445 /* 0xe2 = 11100010b */ X86_EFL_PF,
446 /* 0xe3 = 11100011b */ 0,
447 /* 0xe4 = 11100100b */ X86_EFL_PF,
448 /* 0xe5 = 11100101b */ 0,
449 /* 0xe6 = 11100110b */ 0,
450 /* 0xe7 = 11100111b */ X86_EFL_PF,
451 /* 0xe8 = 11101000b */ X86_EFL_PF,
452 /* 0xe9 = 11101001b */ 0,
453 /* 0xea = 11101010b */ 0,
454 /* 0xeb = 11101011b */ X86_EFL_PF,
455 /* 0xec = 11101100b */ 0,
456 /* 0xed = 11101101b */ X86_EFL_PF,
457 /* 0xee = 11101110b */ X86_EFL_PF,
458 /* 0xef = 11101111b */ 0,
459 /* 0xf0 = 11110000b */ X86_EFL_PF,
460 /* 0xf1 = 11110001b */ 0,
461 /* 0xf2 = 11110010b */ 0,
462 /* 0xf3 = 11110011b */ X86_EFL_PF,
463 /* 0xf4 = 11110100b */ 0,
464 /* 0xf5 = 11110101b */ X86_EFL_PF,
465 /* 0xf6 = 11110110b */ X86_EFL_PF,
466 /* 0xf7 = 11110111b */ 0,
467 /* 0xf8 = 11111000b */ 0,
468 /* 0xf9 = 11111001b */ X86_EFL_PF,
469 /* 0xfa = 11111010b */ X86_EFL_PF,
470 /* 0xfb = 11111011b */ 0,
471 /* 0xfc = 11111100b */ X86_EFL_PF,
472 /* 0xfd = 11111101b */ 0,
473 /* 0xfe = 11111110b */ 0,
474 /* 0xff = 11111111b */ X86_EFL_PF,
475};
476
477/* for clang: */
478extern const RTFLOAT32U g_ar32Zero[];
479extern const RTFLOAT64U g_ar64Zero[];
480extern const RTFLOAT80U g_ar80Zero[];
481extern const RTFLOAT32U g_ar32One[];
482extern const RTFLOAT80U g_ar80One[];
483extern const RTFLOAT80U g_r80Indefinite;
484extern const RTFLOAT32U g_ar32Infinity[];
485extern const RTFLOAT64U g_ar64Infinity[];
486extern const RTFLOAT80U g_ar80Infinity[];
487extern const RTFLOAT128U g_r128Ln2;
488extern const RTUINT128U g_u128Ln2Mantissa;
489extern const RTUINT128U g_u128Ln2MantissaIntel;
490extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
491extern const RTFLOAT32U g_ar32QNaN[];
492extern const RTFLOAT64U g_ar64QNaN[];
493
494/** Zero values (indexed by fSign). */
495RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
496RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
497RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
498
499/** One values (indexed by fSign). */
500RTFLOAT32U const g_ar32One[] =
501{ RTFLOAT32U_INIT(0, 0, RTFLOAT32U_EXP_BIAS), RTFLOAT32U_INIT(1, 0, RTFLOAT32U_EXP_BIAS) };
502RTFLOAT80U const g_ar80One[] =
503{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
504
505/** Indefinite (negative). */
506RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
507
508/** Infinities (indexed by fSign). */
509RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
510RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
511RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
512
513/** Default QNaNs (indexed by fSign). */
514RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
515RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
516
517
#if 0 /* currently compiled out */
/** 128-bit floating point constant: 2.0 */
RTFLOAT128U const g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
#endif
522
523
524/* The next section is generated by tools/IEMGenFpuConstants: */
525
526/** The ln2 constant as 128-bit floating point value.
527 * base-10: 6.93147180559945309417232121458176575e-1
528 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
529 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
530//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
531const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
532/** High precision ln2 value.
533 * base-10: 6.931471805599453094172321214581765680747e-1
534 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
535 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
536const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
537/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
538 * base-10: 6.931471805599453094151379470289064954613e-1
539 * base-16: b.17217f7d1cf79abc0000000000000000@-1
540 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
541const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
542
/** Horner constants for f2xm1.
 *
 * Twenty-two 128-bit floating point coefficients, a0 thru a21.  Going by the
 * base-10 annotations below, coefficient a<n> matches 1/(n+1)! (a0 = 1/1!,
 * a1 = 1/2!, a2 = 1/3!, ...).  The values come from the generator tool
 * mentioned above; do not edit them by hand.
 *
 * Each entry is annotated with the value in base 10, 16 and 2. */
const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
{
    /* a0
     * base-10: 1.00000000000000000000000000000000000e0
     * base-16: 1.0000000000000000000000000000@0
     * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
    RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
    /* a1
     * base-10: 5.00000000000000000000000000000000000e-1
     * base-16: 8.0000000000000000000000000000@-1
     * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
    RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
    /* a2
     * base-10: 1.66666666666666666666666666666666658e-1
     * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
     * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
    RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
    /* a3
     * base-10: 4.16666666666666666666666666666666646e-2
     * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
     * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
    RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
    /* a4
     * base-10: 8.33333333333333333333333333333333323e-3
     * base-16: 2.2222222222222222222222222222@-2
     * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
    RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
    /* a5
     * base-10: 1.38888888888888888888888888888888874e-3
     * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
     * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
    RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
    /* a6
     * base-10: 1.98412698412698412698412698412698412e-4
     * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
     * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
    RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
    /* a7
     * base-10: 2.48015873015873015873015873015873015e-5
     * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
     * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
    RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
    /* a8
     * base-10: 2.75573192239858906525573192239858902e-6
     * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
     * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
    RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
    /* a9
     * base-10: 2.75573192239858906525573192239858865e-7
     * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
     * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
    RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
    /* a10
     * base-10: 2.50521083854417187750521083854417184e-8
     * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
     * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
    RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
    /* a11
     * base-10: 2.08767569878680989792100903212014296e-9
     * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
     * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
    RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
    /* a12
     * base-10: 1.60590438368216145993923771701549472e-10
     * base-16: b.092309d43684be51c198e91d7b40@-9
     * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
    RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
    /* a13
     * base-10: 1.14707455977297247138516979786821043e-11
     * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
     * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
    RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
    /* a14
     * base-10: 7.64716373181981647590113198578806964e-13
     * base-16: d.73f9f399dc0f88ec32b587746578@-11
     * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
    RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
    /* a15
     * base-10: 4.77947733238738529743820749111754352e-14
     * base-16: d.73f9f399dc0f88ec32b587746578@-12
     * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
    RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
    /* a16
     * base-10: 2.81145725434552076319894558301031970e-15
     * base-16: c.a963b81856a53593028cbbb8d7f8@-13
     * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
    RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
    /* a17
     * base-10: 1.56192069685862264622163643500573321e-16
     * base-16: b.413c31dcbecbbdd8024435161550@-14
     * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
    RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
    /* a18
     * base-10: 8.22063524662432971695598123687227980e-18
     * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
     * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
    RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
    /* a19
     * base-10: 4.11031762331216485847799061843614006e-19
     * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
     * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
    RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
    /* a20
     * base-10: 1.95729410633912612308475743735054143e-20
     * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
     * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
    RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
    /* a21
     * base-10: 8.89679139245057328674889744250246106e-22
     * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
     * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
    RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
};
657
658
659/*
660 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
661 * it all in C is probably safer atm., optimize what's necessary later, maybe.
662 */
663#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
664
665
666/*********************************************************************************************************************************
667* Binary Operations *
668*********************************************************************************************************************************/
669
670/*
671 * ADD
672 */
673
674IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_add_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
675{
676 uint64_t uDst = *puDst;
677 uint64_t uResult = uDst + uSrc;
678 *puDst = uResult;
679 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
680 return fEFlags;
681}
682
683# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
684
685IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_add_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
686{
687 uint32_t uDst = *puDst;
688 uint32_t uResult = uDst + uSrc;
689 *puDst = uResult;
690 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
691 return fEFlags;
692}
693
694
695IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_add_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
696{
697 uint16_t uDst = *puDst;
698 uint16_t uResult = uDst + uSrc;
699 *puDst = uResult;
700 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
701 return fEFlags;
702}
703
704
705IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_add_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
706{
707 uint8_t uDst = *puDst;
708 uint8_t uResult = uDst + uSrc;
709 *puDst = uResult;
710 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
711 return fEFlags;
712}
713
714# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
715
716/*
717 * ADC
718 */
719
720IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adc_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
721{
722 if (!(fEFlags & X86_EFL_CF))
723 fEFlags = iemAImpl_add_u64(fEFlags, puDst, uSrc);
724 else
725 {
726 uint64_t uDst = *puDst;
727 uint64_t uResult = uDst + uSrc + 1;
728 *puDst = uResult;
729 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
730 }
731 return fEFlags;
732}
733
734# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
735
736IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adc_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
737{
738 if (!(fEFlags & X86_EFL_CF))
739 fEFlags = iemAImpl_add_u32(fEFlags, puDst, uSrc);
740 else
741 {
742 uint32_t uDst = *puDst;
743 uint32_t uResult = uDst + uSrc + 1;
744 *puDst = uResult;
745 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
746 }
747 return fEFlags;
748}
749
750
751IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adc_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
752{
753 if (!(fEFlags & X86_EFL_CF))
754 fEFlags = iemAImpl_add_u16(fEFlags, puDst, uSrc);
755 else
756 {
757 uint16_t uDst = *puDst;
758 uint16_t uResult = uDst + uSrc + 1;
759 *puDst = uResult;
760 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
761 }
762 return fEFlags;
763}
764
765
766IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adc_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
767{
768 if (!(fEFlags & X86_EFL_CF))
769 fEFlags = iemAImpl_add_u8(fEFlags, puDst, uSrc);
770 else
771 {
772 uint8_t uDst = *puDst;
773 uint8_t uResult = uDst + uSrc + 1;
774 *puDst = uResult;
775 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
776 }
777 return fEFlags;
778}
779
780# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
781
782/*
783 * SUB
784 */
785# if !defined(RT_ARCH_ARM64)
786
787IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sub_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
788{
789 uint64_t uDst = *puDst;
790 uint64_t uResult = uDst - uSrc;
791 *puDst = uResult;
792 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
793 return fEFlags;
794}
795
796# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
797
798IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sub_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
799{
800 uint32_t uDst = *puDst;
801 uint32_t uResult = uDst - uSrc;
802 *puDst = uResult;
803 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
804 return fEFlags;
805}
806
807
808IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sub_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
809{
810 uint16_t uDst = *puDst;
811 uint16_t uResult = uDst - uSrc;
812 *puDst = uResult;
813 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
814 return fEFlags;
815}
816
817
818IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sub_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
819{
820 uint8_t uDst = *puDst;
821 uint8_t uResult = uDst - uSrc;
822 *puDst = uResult;
823 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
824 return fEFlags;
825}
826
827# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
828# endif /* !RT_ARCH_ARM64 */
829
830/*
831 * SBB
832 */
833
834IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sbb_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
835{
836 if (!(fEFlags & X86_EFL_CF))
837 fEFlags = iemAImpl_sub_u64(fEFlags, puDst, uSrc);
838 else
839 {
840 uint64_t uDst = *puDst;
841 uint64_t uResult = uDst - uSrc - 1;
842 *puDst = uResult;
843 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
844 }
845 return fEFlags;
846}
847
848# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
849
850IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sbb_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
851{
852 if (!(fEFlags & X86_EFL_CF))
853 fEFlags = iemAImpl_sub_u32(fEFlags, puDst, uSrc);
854 else
855 {
856 uint32_t uDst = *puDst;
857 uint32_t uResult = uDst - uSrc - 1;
858 *puDst = uResult;
859 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
860 }
861 return fEFlags;
862}
863
864
865IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sbb_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
866{
867 if (!(fEFlags & X86_EFL_CF))
868 fEFlags = iemAImpl_sub_u16(fEFlags, puDst, uSrc);
869 else
870 {
871 uint16_t uDst = *puDst;
872 uint16_t uResult = uDst - uSrc - 1;
873 *puDst = uResult;
874 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
875 }
876 return fEFlags;
877}
878
879
880IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sbb_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
881{
882 if (!(fEFlags & X86_EFL_CF))
883 fEFlags = iemAImpl_sub_u8(fEFlags, puDst, uSrc);
884 else
885 {
886 uint8_t uDst = *puDst;
887 uint8_t uResult = uDst - uSrc - 1;
888 *puDst = uResult;
889 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
890 }
891 return fEFlags;
892}
893
894# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
895
896
897/*
898 * OR
899 */
900
901IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_or_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
902{
903 uint64_t uResult = *puDst | uSrc;
904 *puDst = uResult;
905 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
906 return fEFlags;
907}
908
909# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
910
911IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_or_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
912{
913 uint32_t uResult = *puDst | uSrc;
914 *puDst = uResult;
915 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
916 return fEFlags;
917}
918
919
920IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_or_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
921{
922 uint16_t uResult = *puDst | uSrc;
923 *puDst = uResult;
924 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 16, 0);
925 return fEFlags;
926}
927
928
929IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_or_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
930{
931 uint8_t uResult = *puDst | uSrc;
932 *puDst = uResult;
933 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 8, 0);
934 return fEFlags;
935}
936
937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
938
939/*
940 * XOR
941 */
942
943IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_xor_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
944{
945 uint64_t uResult = *puDst ^ uSrc;
946 *puDst = uResult;
947 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
948 return fEFlags;
949}
950
951# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
952
953IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_xor_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
954{
955 uint32_t uResult = *puDst ^ uSrc;
956 *puDst = uResult;
957 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
958 return fEFlags;
959}
960
961
962IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_xor_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
963{
964 uint16_t uResult = *puDst ^ uSrc;
965 *puDst = uResult;
966 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 16, 0);
967 return fEFlags;
968}
969
970
971IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_xor_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
972{
973 uint8_t uResult = *puDst ^ uSrc;
974 *puDst = uResult;
975 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 8, 0);
976 return fEFlags;
977}
978
979# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
980
981/*
982 * AND
983 */
984
985IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_and_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
986{
987 uint64_t const uResult = *puDst & uSrc;
988 *puDst = uResult;
989 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
990 return fEFlags;
991}
992
993# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
994
995IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_and_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
996{
997 uint32_t const uResult = *puDst & uSrc;
998 *puDst = uResult;
999 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
1000 return fEFlags;
1001}
1002
1003
1004IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_and_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1005{
1006 uint16_t const uResult = *puDst & uSrc;
1007 *puDst = uResult;
1008 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 16, 0);
1009 return fEFlags;
1010}
1011
1012
1013IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_and_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
1014{
1015 uint8_t const uResult = *puDst & uSrc;
1016 *puDst = uResult;
1017 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 8, 0);
1018 return fEFlags;
1019}
1020
1021# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1022#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1023
1024/*
1025 * ANDN (BMI1 instruction)
1026 */
1027
1028IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
1029{
1030 uint64_t const uResult = ~uSrc1 & uSrc2;
1031 *puDst = uResult;
1032 uint32_t fEFlags = *pfEFlags;
1033 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
1034 *pfEFlags = fEFlags;
1035}
1036
1037
1038IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1039{
1040 uint32_t const uResult = ~uSrc1 & uSrc2;
1041 *puDst = uResult;
1042 uint32_t fEFlags = *pfEFlags;
1043 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
1044 *pfEFlags = fEFlags;
1045}
1046
1047
1048#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1049IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
1050{
1051 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1052}
1053#endif
1054
1055
1056#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1057IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1058{
1059 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1060}
1061#endif
1062
1063#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1064
1065/*
1066 * CMP
1067 */
1068
1069IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmp_u64,(uint32_t fEFlags, uint64_t const *puDst, uint64_t uSrc))
1070{
1071 uint64_t uDstTmp = *puDst;
1072 return iemAImpl_sub_u64(fEFlags, &uDstTmp, uSrc);
1073}
1074
1075# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1076
1077IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmp_u32,(uint32_t fEFlags, uint32_t const *puDst, uint32_t uSrc))
1078{
1079 uint32_t uDstTmp = *puDst;
1080 return iemAImpl_sub_u32(fEFlags, &uDstTmp, uSrc);
1081}
1082
1083
1084IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmp_u16,(uint32_t fEFlags, uint16_t const *puDst, uint16_t uSrc))
1085{
1086 uint16_t uDstTmp = *puDst;
1087 return iemAImpl_sub_u16(fEFlags, &uDstTmp, uSrc);
1088}
1089
1090
1091IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmp_u8,(uint32_t fEFlags, uint8_t const *puDst, uint8_t uSrc))
1092{
1093 uint8_t uDstTmp = *puDst;
1094 return iemAImpl_sub_u8(fEFlags, &uDstTmp, uSrc);
1095}
1096
1097# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1098
1099/*
1100 * TEST
1101 */
1102
1103IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_test_u64,(uint32_t fEFlags, uint64_t const *puDst, uint64_t uSrc))
1104{
1105 uint64_t uResult = *puDst & uSrc;
1106 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
1107 return fEFlags;
1108}
1109
1110# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1111
1112IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_test_u32,(uint32_t fEFlags, uint32_t const *puDst, uint32_t uSrc))
1113{
1114 uint32_t uResult = *puDst & uSrc;
1115 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
1116 return fEFlags;
1117}
1118
1119
1120IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_test_u16,(uint32_t fEFlags, uint16_t const *puDst, uint16_t uSrc))
1121{
1122 uint16_t uResult = *puDst & uSrc;
1123 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 16, 0);
1124 return fEFlags;
1125}
1126
1127
1128IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_test_u8,(uint32_t fEFlags, uint8_t const *puDst, uint8_t uSrc))
1129{
1130 uint8_t uResult = *puDst & uSrc;
1131 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 8, 0);
1132 return fEFlags;
1133}
1134
1135# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1136
1137
1138/*
1139 * LOCK prefixed variants of the above
1140 */
1141
/**
 * Locked binary operand operation, any width (8, 16, 32 or 64 bits).
 *
 * Must expand inside a function that has @c fEFlagsIn, @c puDst and @c uSrc in
 * scope; the expansion ends by returning the EFLAGS from the last worker call.
 *
 * The non-locked worker iemAImpl_<mnemonic>_u<width> runs on a private copy of
 * the destination, and the outcome is published with
 * ASMAtomicCmpXchgExU<width>.  The loop repeats for as long as that call
 * returns false, restarting from the value it stored via its last argument
 * (presumably the current memory content - see the IPRT documentation).
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uSeen = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uNew; \
        uint32_t                   fEflOut; \
        do \
        { \
            uNew    = uSeen; \
            fEflOut = iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(fEFlagsIn, &uNew, uSrc); \
        } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uSeen, &uSeen)); \
        return fEflOut; \
    } while (0)
1155
1156
1157#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1158 IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint32_t fEFlagsIn, \
1159 uint ## a_cBitsWidth ## _t *puDst, \
1160 uint ## a_cBitsWidth ## _t uSrc)) \
1161 { \
1162 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1163 }
1164
1165EMIT_LOCKED_BIN_OP(add, 64)
1166EMIT_LOCKED_BIN_OP(adc, 64)
1167EMIT_LOCKED_BIN_OP(sub, 64)
1168EMIT_LOCKED_BIN_OP(sbb, 64)
1169EMIT_LOCKED_BIN_OP(or, 64)
1170EMIT_LOCKED_BIN_OP(xor, 64)
1171EMIT_LOCKED_BIN_OP(and, 64)
1172# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1173EMIT_LOCKED_BIN_OP(add, 32)
1174EMIT_LOCKED_BIN_OP(adc, 32)
1175EMIT_LOCKED_BIN_OP(sub, 32)
1176EMIT_LOCKED_BIN_OP(sbb, 32)
1177EMIT_LOCKED_BIN_OP(or, 32)
1178EMIT_LOCKED_BIN_OP(xor, 32)
1179EMIT_LOCKED_BIN_OP(and, 32)
1180
1181EMIT_LOCKED_BIN_OP(add, 16)
1182EMIT_LOCKED_BIN_OP(adc, 16)
1183EMIT_LOCKED_BIN_OP(sub, 16)
1184EMIT_LOCKED_BIN_OP(sbb, 16)
1185EMIT_LOCKED_BIN_OP(or, 16)
1186EMIT_LOCKED_BIN_OP(xor, 16)
1187EMIT_LOCKED_BIN_OP(and, 16)
1188
1189EMIT_LOCKED_BIN_OP(add, 8)
1190EMIT_LOCKED_BIN_OP(adc, 8)
1191EMIT_LOCKED_BIN_OP(sub, 8)
1192EMIT_LOCKED_BIN_OP(sbb, 8)
1193EMIT_LOCKED_BIN_OP(or, 8)
1194EMIT_LOCKED_BIN_OP(xor, 8)
1195EMIT_LOCKED_BIN_OP(and, 8)
1196# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1197
1198
1199/*
1200 * Bit operations (same signature as above).
1201 */
1202
1203/*
1204 * BT
1205 */
1206
1207IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bt_u64,(uint32_t fEFlags, uint64_t const *puDst, uint64_t uSrc))
1208{
1209 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1210 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1211 Assert(uSrc < 64);
1212 uint64_t uDst = *puDst;
1213 if (uDst & RT_BIT_64(uSrc))
1214 fEFlags |= X86_EFL_CF;
1215 else
1216 fEFlags &= ~X86_EFL_CF;
1217 return fEFlags;
1218}
1219
1220# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1221
1222IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bt_u32,(uint32_t fEFlags, uint32_t const *puDst, uint32_t uSrc))
1223{
1224 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1225 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1226 Assert(uSrc < 32);
1227 uint32_t uDst = *puDst;
1228 if (uDst & RT_BIT_32(uSrc))
1229 fEFlags |= X86_EFL_CF;
1230 else
1231 fEFlags &= ~X86_EFL_CF;
1232 return fEFlags;
1233}
1234
1235IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bt_u16,(uint32_t fEFlags, uint16_t const *puDst, uint16_t uSrc))
1236{
1237 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1238 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1239 Assert(uSrc < 16);
1240 uint16_t uDst = *puDst;
1241 if (uDst & RT_BIT_32(uSrc))
1242 fEFlags |= X86_EFL_CF;
1243 else
1244 fEFlags &= ~X86_EFL_CF;
1245 return fEFlags;
1246}
1247
1248# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1249
1250/*
1251 * BTC
1252 */
1253
1254IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_btc_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1255{
1256 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1257 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1258 Assert(uSrc < 64);
1259 uint64_t fMask = RT_BIT_64(uSrc);
1260 uint64_t uDst = *puDst;
1261 if (uDst & fMask)
1262 {
1263 uDst &= ~fMask;
1264 *puDst = uDst;
1265 fEFlags |= X86_EFL_CF;
1266 }
1267 else
1268 {
1269 uDst |= fMask;
1270 *puDst = uDst;
1271 fEFlags &= ~X86_EFL_CF;
1272 }
1273 return fEFlags;
1274}
1275
1276# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1277
1278IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_btc_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1279{
1280 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1281 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1282 Assert(uSrc < 32);
1283 uint32_t fMask = RT_BIT_32(uSrc);
1284 uint32_t uDst = *puDst;
1285 if (uDst & fMask)
1286 {
1287 uDst &= ~fMask;
1288 *puDst = uDst;
1289 fEFlags |= X86_EFL_CF;
1290 }
1291 else
1292 {
1293 uDst |= fMask;
1294 *puDst = uDst;
1295 fEFlags &= ~X86_EFL_CF;
1296 }
1297 return fEFlags;
1298}
1299
1300
1301IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_btc_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1302{
1303 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1304 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1305 Assert(uSrc < 16);
1306 uint16_t fMask = RT_BIT_32(uSrc);
1307 uint16_t uDst = *puDst;
1308 if (uDst & fMask)
1309 {
1310 uDst &= ~fMask;
1311 *puDst = uDst;
1312 fEFlags |= X86_EFL_CF;
1313 }
1314 else
1315 {
1316 uDst |= fMask;
1317 *puDst = uDst;
1318 fEFlags &= ~X86_EFL_CF;
1319 }
1320 return fEFlags;
1321}
1322
1323# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1324
1325/*
1326 * BTR
1327 */
1328
1329IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_btr_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1330{
1331 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1332 logical operation (AND/OR/whatever). */
1333 Assert(uSrc < 64);
1334 uint64_t fMask = RT_BIT_64(uSrc);
1335 uint64_t uDst = *puDst;
1336 if (uDst & fMask)
1337 {
1338 uDst &= ~fMask;
1339 *puDst = uDst;
1340 fEFlags |= X86_EFL_CF;
1341 }
1342 else
1343 fEFlags &= ~X86_EFL_CF;
1344 return fEFlags;
1345}
1346
1347# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1348
1349IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_btr_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1350{
1351 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1352 logical operation (AND/OR/whatever). */
1353 Assert(uSrc < 32);
1354 uint32_t fMask = RT_BIT_32(uSrc);
1355 uint32_t uDst = *puDst;
1356 if (uDst & fMask)
1357 {
1358 uDst &= ~fMask;
1359 *puDst = uDst;
1360 fEFlags |= X86_EFL_CF;
1361 }
1362 else
1363 fEFlags &= ~X86_EFL_CF;
1364 return fEFlags;
1365}
1366
1367
1368IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_btr_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1369{
1370 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1371 logical operation (AND/OR/whatever). */
1372 Assert(uSrc < 16);
1373 uint16_t fMask = RT_BIT_32(uSrc);
1374 uint16_t uDst = *puDst;
1375 if (uDst & fMask)
1376 {
1377 uDst &= ~fMask;
1378 *puDst = uDst;
1379 fEFlags |= X86_EFL_CF;
1380 }
1381 else
1382 fEFlags &= ~X86_EFL_CF;
1383 return fEFlags;
1384}
1385
1386# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1387
1388/*
1389 * BTS
1390 */
1391
1392IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bts_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1393{
1394 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1395 logical operation (AND/OR/whatever). */
1396 Assert(uSrc < 64);
1397 uint64_t fMask = RT_BIT_64(uSrc);
1398 uint64_t uDst = *puDst;
1399 if (uDst & fMask)
1400 fEFlags |= X86_EFL_CF;
1401 else
1402 {
1403 uDst |= fMask;
1404 *puDst = uDst;
1405 fEFlags &= ~X86_EFL_CF;
1406 }
1407 return fEFlags;
1408}
1409
1410# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1411
1412IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bts_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1413{
1414 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1415 logical operation (AND/OR/whatever). */
1416 Assert(uSrc < 32);
1417 uint32_t fMask = RT_BIT_32(uSrc);
1418 uint32_t uDst = *puDst;
1419 if (uDst & fMask)
1420 fEFlags |= X86_EFL_CF;
1421 else
1422 {
1423 uDst |= fMask;
1424 *puDst = uDst;
1425 fEFlags &= ~X86_EFL_CF;
1426 }
1427 return fEFlags;
1428}
1429
1430
1431IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bts_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1432{
1433 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1434 logical operation (AND/OR/whatever). */
1435 Assert(uSrc < 16);
1436 uint16_t fMask = RT_BIT_32(uSrc);
1437 uint32_t uDst = *puDst;
1438 if (uDst & fMask)
1439 fEFlags |= X86_EFL_CF;
1440 else
1441 {
1442 uDst |= fMask;
1443 *puDst = uDst;
1444 fEFlags &= ~X86_EFL_CF;
1445 }
1446 return fEFlags;
1447}
1448
1449# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1450
1451EMIT_LOCKED_BIN_OP(btc, 64)
1452EMIT_LOCKED_BIN_OP(btr, 64)
1453EMIT_LOCKED_BIN_OP(bts, 64)
1454# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1455EMIT_LOCKED_BIN_OP(btc, 32)
1456EMIT_LOCKED_BIN_OP(btr, 32)
1457EMIT_LOCKED_BIN_OP(bts, 32)
1458
1459EMIT_LOCKED_BIN_OP(btc, 16)
1460EMIT_LOCKED_BIN_OP(btr, 16)
1461EMIT_LOCKED_BIN_OP(bts, 16)
1462# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1463
1464#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1465
/*
 * Helpers for BSR and BSF.
 *
 * Note! "undefined" flags: OF, SF, AF, PF, CF.
 *       Intel behavior modelled on 10980xe, AMD on 3990X.  Other
 *       microarchitectures may produce different results (see
 *       https://www.sandpile.org/x86/flags.htm), but we restrict ourselves to
 *       emulating these recent microarchitectures.
 */
/**
 * Intel-style BSF/BSR result handling.
 *
 * @a a_iBit is a one-based bit position, zero meaning no bit was found.  All
 * six status flags are cleared first.  When a bit was found, the zero-based
 * index is stored in @a a_puDst and IEM_EFL_CALC_PARITY of it is OR'ed into
 * the flags; otherwise the destination is left alone and ZF and PF are set.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_fEFlagsVar, a_iBit) do { \
        unsigned const iOneBased = (a_iBit); \
        a_fEFlagsVar &= ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (!iOneBased) \
            a_fEFlagsVar |= X86_EFL_ZF | X86_EFL_PF; \
        else \
        { \
            unsigned iZeroBased = iOneBased - 1; \
            *(a_puDst) = iZeroBased; \
            a_fEFlagsVar |= IEM_EFL_CALC_PARITY(iZeroBased); \
        } \
    } while (0)
/**
 * AMD-style BSF/BSR result handling.
 *
 * @a a_iBit is a one-based bit position, zero meaning no bit was found.  Only
 * ZF is modified: it is cleared when a bit was found (and the zero-based index
 * is stored in @a a_puDst), and set otherwise (destination left alone).
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_fEFlagsVar, a_iBit) do { \
        unsigned const iOneBased = (a_iBit); \
        if (!iOneBased) \
            a_fEFlagsVar |= X86_EFL_ZF; \
        else \
        { \
            *(a_puDst) = iOneBased - 1; \
            a_fEFlagsVar &= ~X86_EFL_ZF; \
        } \
    } while (0)
1495
1496/*
1497 * BSF - first (least significant) bit set
1498 */
1499#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1500IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1501{
1502 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitFirstSetU64(uSrc));
1503 return fEFlags;
1504}
1505#endif
1506
1507IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u64_intel,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitFirstSetU64(uSrc));
1510 return fEFlags;
1511}
1512
1513IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u64_amd,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1514{
1515 SET_BIT_SEARCH_RESULT_AMD(puDst, fEFlags, ASMBitFirstSetU64(uSrc));
1516 return fEFlags;
1517}
1518
1519#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1520IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1521{
1522 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitFirstSetU32(uSrc));
1523 return fEFlags;
1524}
1525#endif
1526
1527IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u32_intel,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1528{
1529 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitFirstSetU32(uSrc));
1530 return fEFlags;
1531}
1532
1533IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u32_amd,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1534{
1535 SET_BIT_SEARCH_RESULT_AMD(puDst, fEFlags, ASMBitFirstSetU32(uSrc));
1536 return fEFlags;
1537}
1538
1539
1540#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1541IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1542{
1543 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitFirstSetU16(uSrc));
1544 return fEFlags;
1545}
1546#endif
1547
1548IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u16_intel,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1549{
1550 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitFirstSetU16(uSrc));
1551 return fEFlags;
1552}
1553
1554IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsf_u16_amd,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1555{
1556 SET_BIT_SEARCH_RESULT_AMD(puDst, fEFlags, ASMBitFirstSetU16(uSrc));
1557 return fEFlags;
1558}
1559
1560
1561
1562/*
1563 * BSR - last (most significant) bit set
1564 */
1565#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1566IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1567{
1568 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitLastSetU64(uSrc));
1569 return fEFlags;
1570}
1571#endif
1572
1573IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u64_intel,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1574{
1575 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitLastSetU64(uSrc));
1576 return fEFlags;
1577}
1578
1579IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u64_amd,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1580{
1581 SET_BIT_SEARCH_RESULT_AMD(puDst, fEFlags, ASMBitLastSetU64(uSrc));
1582 return fEFlags;
1583}
1584
1585
1586#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1587IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1588{
1589 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitLastSetU32(uSrc));
1590 return fEFlags;
1591}
1592#endif
1593
1594IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u32_intel,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1595{
1596 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitLastSetU32(uSrc));
1597 return fEFlags;
1598}
1599
1600IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u32_amd,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1601{
1602 SET_BIT_SEARCH_RESULT_AMD(puDst, fEFlags, ASMBitLastSetU32(uSrc));
1603 return fEFlags;
1604}
1605
1606
1607#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1608IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1609{
1610 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitLastSetU16(uSrc));
1611 return fEFlags;
1612}
1613#endif
1614
1615IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u16_intel,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1616{
1617 SET_BIT_SEARCH_RESULT_INTEL(puDst, fEFlags, ASMBitLastSetU16(uSrc));
1618 return fEFlags;
1619}
1620
1621IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_bsr_u16_amd,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1622{
1623 SET_BIT_SEARCH_RESULT_AMD(puDst, fEFlags, ASMBitLastSetU16(uSrc));
1624 return fEFlags;
1625}
1626
1627
1628/*
1629 * Helpers for LZCNT and TZCNT.
1630 */
/**
 * Intel-style LZCNT/TZCNT result handling.
 *
 * Stores @a a_uResult in @a a_puDst, clears all six status flags, then ORs in
 * IEM_EFL_CALC_PARITY of the result when it is non-zero, or ZF and PF when it
 * is zero.  CF is set when @a a_uSrc is zero.
 *
 * @a a_uSrc is parenthesized before negation so that compound argument
 * expressions are tested as a whole.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_fEFlagsVar, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        a_fEFlagsVar &= ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            a_fEFlagsVar |= IEM_EFL_CALC_PARITY(uResult); \
        else \
            a_fEFlagsVar |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            a_fEFlagsVar |= X86_EFL_CF; \
    } while (0)
/**
 * AMD-style LZCNT/TZCNT result handling.
 *
 * Stores @a a_uResult in @a a_puDst and updates only ZF and CF: ZF is set when
 * the result is zero, CF when @a a_uSrc is zero.
 *
 * @a a_uSrc is parenthesized before negation so that compound argument
 * expressions are tested as a whole.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_fEFlagsVar, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        a_fEFlagsVar &= ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            a_fEFlagsVar |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            a_fEFlagsVar |= X86_EFL_CF; \
    } while (0)
1651
1652
1653/*
1654 * LZCNT - count leading zero bits.
1655 */
1656#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1657IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1658{
1659 return iemAImpl_lzcnt_u64_intel(fEFlags, puDst, uSrc);
1660}
1661#endif
1662
1663IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u64_intel,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, fEFlags, ASMCountLeadingZerosU64(uSrc));
1666 return fEFlags;
1667}
1668
1669IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u64_amd,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1670{
1671 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, fEFlags, ASMCountLeadingZerosU64(uSrc));
1672 return fEFlags;
1673}
1674
1675
1676#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1677IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1678{
1679 return iemAImpl_lzcnt_u32_intel(fEFlags, puDst, uSrc);
1680}
1681#endif
1682
1683IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u32_intel,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1684{
1685 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, fEFlags, ASMCountLeadingZerosU32(uSrc));
1686 return fEFlags;
1687}
1688
1689IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u32_amd,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1690{
1691 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, fEFlags, ASMCountLeadingZerosU32(uSrc));
1692 return fEFlags;
1693}
1694
1695
1696#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1697IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1698{
1699 return iemAImpl_lzcnt_u16_intel(fEFlags, puDst, uSrc);
1700}
1701#endif
1702
1703IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u16_intel,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1704{
1705 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, fEFlags, ASMCountLeadingZerosU16(uSrc));
1706 return fEFlags;
1707}
1708
1709IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_lzcnt_u16_amd,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1710{
1711 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, fEFlags, ASMCountLeadingZerosU16(uSrc));
1712 return fEFlags;
1713}
1714
1715
/*
 * TZCNT - count trailing zero bits.
 */
1719#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1720IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1721{
1722 return iemAImpl_tzcnt_u64_intel(fEFlags, puDst, uSrc);
1723}
1724#endif
1725
1726IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u64_intel,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1727{
1728 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, fEFlags, ASMCountTrailingZerosU64(uSrc));
1729 return fEFlags;
1730}
1731
1732IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u64_amd,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
1733{
1734 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, fEFlags, ASMCountTrailingZerosU64(uSrc));
1735 return fEFlags;
1736}
1737
1738
1739#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1740IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1741{
1742 return iemAImpl_tzcnt_u32_intel(fEFlags, puDst, uSrc);
1743}
1744#endif
1745
1746IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u32_intel,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1747{
1748 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, fEFlags, ASMCountTrailingZerosU32(uSrc));
1749 return fEFlags;
1750}
1751
1752IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u32_amd,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
1753{
1754 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, fEFlags, ASMCountTrailingZerosU32(uSrc));
1755 return fEFlags;
1756}
1757
1758
1759#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1760IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1761{
1762 return iemAImpl_tzcnt_u16_intel(fEFlags, puDst, uSrc);
1763}
1764#endif
1765
1766IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u16_intel,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1767{
1768 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, fEFlags, ASMCountTrailingZerosU16(uSrc));
1769 return fEFlags;
1770}
1771
1772IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_tzcnt_u16_amd,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1773{
1774 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, fEFlags, ASMCountTrailingZerosU16(uSrc));
1775 return fEFlags;
1776}
1777
1778
1779
1780/*
1781 * BEXTR (BMI1 instruction)
1782 */
1783#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1784IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1785 a_Type uSrc2, uint32_t *pfEFlags)) \
1786{ \
1787 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1788 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1789 a_Type uResult; \
1790 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1791 if (iFirstBit < a_cBits) \
1792 { \
1793 uResult = uSrc1 >> iFirstBit; \
1794 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1795 if (cBits < a_cBits) \
1796 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1797 *puDst = uResult; \
1798 if (!uResult) \
1799 fEfl |= X86_EFL_ZF; \
1800 } \
1801 else \
1802 { \
1803 *puDst = uResult = 0; \
1804 fEfl |= X86_EFL_ZF; \
1805 } \
1806 /** @todo complete flag calculations. */ \
1807 *pfEFlags = fEfl; \
1808}
1809
1810EMIT_BEXTR(64, uint64_t, _fallback)
1811EMIT_BEXTR(32, uint32_t, _fallback)
1812#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1813EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1814#endif
1815#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1816EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1817#endif
1818
1819/*
1820 * BLSR (BMI1 instruction)
1821 */
1822#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1823IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(uint32_t fEFlags, a_Type *puDst, a_Type uSrc)) \
1824{ \
1825 *puDst = uSrc; \
1826 uint32_t fEfl1 = iemAImpl_sub_u ## a_cBits(fEFlags, &uSrc, 1); \
1827 uint32_t fEfl2 = iemAImpl_and_u ## a_cBits(fEFlags, puDst, uSrc); \
1828 \
1829 /* AMD: The carry flag is from the SUB operation. */ \
1830 /* 10890xe: PF always cleared? */ \
1831 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1832 fEfl2 |= fEfl1 & X86_EFL_CF; \
1833 return fEfl2; \
1834}
1835
1836EMIT_BLSR(64, uint64_t, _fallback)
1837EMIT_BLSR(32, uint32_t, _fallback)
1838#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1839EMIT_BLSR(64, uint64_t, RT_NOTHING)
1840#endif
1841#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1842EMIT_BLSR(32, uint32_t, RT_NOTHING)
1843#endif
1844
1845/*
1846 * BLSMSK (BMI1 instruction)
1847 */
1848#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1849IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(uint32_t fEFlags, a_Type *puDst, a_Type uSrc)) \
1850{ \
1851 *puDst = uSrc; \
1852 uint32_t fEfl1 = iemAImpl_sub_u ## a_cBits(fEFlags, &uSrc, 1); \
1853 uint32_t fEfl2 = iemAImpl_xor_u ## a_cBits(fEFlags, puDst, uSrc); \
1854 \
1855 /* AMD: The carry flag is from the SUB operation. */ \
1856 /* 10890xe: PF always cleared? */ \
1857 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1858 fEfl2 |= fEfl1 & X86_EFL_CF; \
1859 return fEfl2; \
1860}
1861
1862EMIT_BLSMSK(64, uint64_t, _fallback)
1863EMIT_BLSMSK(32, uint32_t, _fallback)
1864#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1865EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1866#endif
1867#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1868EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1869#endif
1870
1871/*
1872 * BLSI (BMI1 instruction)
1873 */
1874#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1875IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(uint32_t fEFlags, a_Type *puDst, a_Type uSrc)) \
1876{ \
1877 uint32_t fEfl1 = fEFlags; \
1878 *puDst = uSrc; \
1879 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1880 uint32_t fEfl2 = iemAImpl_and_u ## a_cBits(fEFlags, puDst, uSrc); \
1881 \
1882 /* AMD: The carry flag is from the SUB operation. */ \
1883 /* 10890xe: PF always cleared? */ \
1884 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1885 fEfl2 |= fEfl1 & X86_EFL_CF; \
1886 return fEfl2; \
1887}
1888
1889EMIT_BLSI(64, uint64_t, _fallback)
1890EMIT_BLSI(32, uint32_t, _fallback)
1891#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1892EMIT_BLSI(64, uint64_t, RT_NOTHING)
1893#endif
1894#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1895EMIT_BLSI(32, uint32_t, RT_NOTHING)
1896#endif
1897
1898/*
1899 * BZHI (BMI2 instruction)
1900 */
1901#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1902IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1903 a_Type uSrc2, uint32_t *pfEFlags)) \
1904{ \
1905 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1906 a_Type uResult; \
1907 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1908 if (iFirstBit < a_cBits) \
1909 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1910 else \
1911 { \
1912 uResult = uSrc1; \
1913 fEfl |= X86_EFL_CF; \
1914 } \
1915 *puDst = uResult; \
1916 fEfl |= X86_EFL_CALC_ZF(uResult); \
1917 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1918 *pfEFlags = fEfl; \
1919}
1920
1921EMIT_BZHI(64, uint64_t, _fallback)
1922EMIT_BZHI(32, uint32_t, _fallback)
1923#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1924EMIT_BZHI(64, uint64_t, RT_NOTHING)
1925#endif
1926#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1927EMIT_BZHI(32, uint32_t, RT_NOTHING)
1928#endif
1929
1930/*
1931 * POPCNT
1932 */
1933RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1934{
1935 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1936 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1937 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1938 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1939};
1940
1941/** @todo Use native popcount where possible and employ some more efficient
1942 * algorithm here (or in asm.h fallback)! */
1943
1944DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1945{
1946 return g_abBitCounts6[ u16 & 0x3f]
1947 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1948 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1949}
1950
1951DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1952{
1953 return g_abBitCounts6[ u32 & 0x3f]
1954 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1955 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1956 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1957 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1958 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1959}
1960
1961DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1962{
1963 return g_abBitCounts6[ u64 & 0x3f]
1964 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1965 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1966 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1967 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1968 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1969 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1970 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1971 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1972 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1973 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1974}
1975
1976#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1977IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(uint32_t fEFlags, a_Type *puDst, a_Type uSrc)) \
1978{ \
1979 fEFlags &= ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1980 a_Type uResult; \
1981 if (uSrc) \
1982 uResult = iemPopCountU ## a_cBits(uSrc); \
1983 else \
1984 { \
1985 fEFlags |= X86_EFL_ZF; \
1986 uResult = 0; \
1987 } \
1988 *puDst = uResult; \
1989 return fEFlags; \
1990}
1991
1992EMIT_POPCNT(64, uint64_t, _fallback)
1993EMIT_POPCNT(32, uint32_t, _fallback)
1994EMIT_POPCNT(16, uint16_t, _fallback)
1995#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1996EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1997#endif
1998#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1999EMIT_POPCNT(32, uint32_t, RT_NOTHING)
2000EMIT_POPCNT(16, uint16_t, RT_NOTHING)
2001#endif
2002
2003
2004#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2005
2006/*
2007 * XCHG
2008 */
2009
2010IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
2011{
2012#if ARCH_BITS >= 64
2013 *puReg = ASMAtomicXchgU64(puMem, *puReg);
2014#else
2015 uint64_t uOldMem = *puMem;
2016 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
2017 ASMNopPause();
2018 *puReg = uOldMem;
2019#endif
2020}
2021
2022# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2023
2024IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
2025{
2026 *puReg = ASMAtomicXchgU32(puMem, *puReg);
2027}
2028
2029
2030IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
2031{
2032 *puReg = ASMAtomicXchgU16(puMem, *puReg);
2033}
2034
2035
2036IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
2037{
2038 *puReg = ASMAtomicXchgU8(puMem, *puReg);
2039}
2040
2041# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2042
2043
2044/* Unlocked variants for fDisregardLock mode: */
2045
2046IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
2047{
2048 uint64_t const uOld = *puMem;
2049 *puMem = *puReg;
2050 *puReg = uOld;
2051}
2052
2053# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2054
2055IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
2056{
2057 uint32_t const uOld = *puMem;
2058 *puMem = *puReg;
2059 *puReg = uOld;
2060}
2061
2062
2063IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
2064{
2065 uint16_t const uOld = *puMem;
2066 *puMem = *puReg;
2067 *puReg = uOld;
2068}
2069
2070
2071IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
2072{
2073 uint8_t const uOld = *puMem;
2074 *puMem = *puReg;
2075 *puReg = uOld;
2076}
2077
2078# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2079
2080
2081/*
2082 * XADD and LOCK XADD.
2083 */
2084#define EMIT_XADD(a_cBitsWidth, a_Type) \
2085IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
2086{ \
2087 a_Type uDst = *puDst; \
2088 a_Type uResult = uDst; \
2089 *pfEFlags = iemAImpl_add_u ## a_cBitsWidth(*pfEFlags, &uResult, *puReg); \
2090 *puDst = uResult; \
2091 *puReg = uDst; \
2092} \
2093\
2094IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
2095{ \
2096 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2097 a_Type uResult; \
2098 uint32_t fEflTmp; \
2099 do \
2100 { \
2101 uResult = uOld; \
2102 fEflTmp = iemAImpl_add_u ## a_cBitsWidth(*pfEFlags, &uResult, *puReg); \
2103 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2104 *puReg = uOld; \
2105 *pfEFlags = fEflTmp; \
2106}
2107EMIT_XADD(64, uint64_t)
2108# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2109EMIT_XADD(32, uint32_t)
2110EMIT_XADD(16, uint16_t)
2111EMIT_XADD(8, uint8_t)
2112# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2113
2114#endif
2115
2116/*
2117 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2118 *
2119 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2120 * instructions are emulated as locked.
2121 */
2122#if defined(IEM_WITHOUT_ASSEMBLY)
2123
2124IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2125{
2126 uint8_t uOld = *puAl;
2127 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2128 Assert(*puAl == uOld);
2129 *pEFlags = iemAImpl_cmp_u8(*pEFlags, &uOld, *puAl);
2130}
2131
2132
2133IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2134{
2135 uint16_t uOld = *puAx;
2136 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2137 Assert(*puAx == uOld);
2138 *pEFlags = iemAImpl_cmp_u16(*pEFlags, &uOld, *puAx);
2139}
2140
2141
2142IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2143{
2144 uint32_t uOld = *puEax;
2145 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2146 Assert(*puEax == uOld);
2147 *pEFlags = iemAImpl_cmp_u32(*pEFlags, &uOld, *puEax);
2148}
2149
2150
2151# if ARCH_BITS == 32
2152IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2153# else
2154IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2155# endif
2156{
2157# if ARCH_BITS == 32
2158 uint64_t const uSrcReg = *puSrcReg;
2159# endif
2160 uint64_t uOld = *puRax;
2161 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2162 Assert(*puRax == uOld);
2163 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, *puRax);
2164}
2165
2166
2167IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2168 uint32_t *pEFlags))
2169{
2170 uint64_t const uNew = pu64EbxEcx->u;
2171 uint64_t const uOld = pu64EaxEdx->u;
2172 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2173 {
2174 Assert(pu64EaxEdx->u == uOld);
2175 *pEFlags |= X86_EFL_ZF;
2176 }
2177 else
2178 *pEFlags &= ~X86_EFL_ZF;
2179}
2180
2181
2182# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
2183IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2184 uint32_t *pEFlags))
2185{
2186# ifdef VBOX_STRICT
2187 RTUINT128U const uOld = *pu128RaxRdx;
2188# endif
2189# if defined(RT_ARCH_AMD64)
2190 if (ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
2191 &pu128RaxRdx->u))
2192# else
2193 if (ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u))
2194# endif
2195 {
2196 Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
2197 *pEFlags |= X86_EFL_ZF;
2198 }
2199 else
2200 *pEFlags &= ~X86_EFL_ZF;
2201}
2202# endif
2203
2204#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2205
2206# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2207IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2208 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2209{
2210 RTUINT128U u128Tmp = *pu128Dst;
2211 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2212 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2213 {
2214 *pu128Dst = *pu128RbxRcx;
2215 *pEFlags |= X86_EFL_ZF;
2216 }
2217 else
2218 {
2219 *pu128RaxRdx = u128Tmp;
2220 *pEFlags &= ~X86_EFL_ZF;
2221 }
2222}
2223#endif /* !RT_ARCH_ARM64 */
2224
2225#if defined(IEM_WITHOUT_ASSEMBLY)
2226
2227/* Unlocked versions mapped to the locked ones: */
2228
/**
 * CMPXCHG, 8-bit operand, non-locked entry point.
 *
 * Simply forwards all four arguments to iemAImpl_cmpxchg_u8_locked, i.e. the
 * 8-bit variant is always carried out using the locked implementation.
 */
2229IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2230{
2231 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2232}
2233
2234
2235IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2236{
2237# if 0
2238 /* If correctly aligned, used the locked variation. */
2239 if (!((uintptr_t)pu16Dst & 1))
2240 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2241 else
2242# endif
2243 {
2244 /* Otherwise emulate it as best as we can. */
2245 uint16_t const uOld = *puAx;
2246 uint16_t const uDst = *pu16Dst;
2247 if (uOld == uDst)
2248 {
2249 *pu16Dst = uSrcReg;
2250 *pEFlags = iemAImpl_cmp_u16(*pEFlags, &uOld, uOld);
2251 }
2252 else
2253 {
2254 *puAx = uDst;
2255 *pEFlags = iemAImpl_cmp_u16(*pEFlags, &uOld, uDst);
2256 }
2257 }
2258}
2259
2260
2261IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2262{
2263# if 0
2264 /* If correctly aligned, used the locked variation. */
2265 if (!((uintptr_t)pu32Dst & 3))
2266 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2267 else
2268# endif
2269 {
2270 /* Otherwise emulate it as best as we can. */
2271 uint32_t const uOld = *puEax;
2272 uint32_t const uDst = *pu32Dst;
2273 if (uOld == uDst)
2274 {
2275 *pu32Dst = uSrcReg;
2276 *pEFlags = iemAImpl_cmp_u32(*pEFlags, &uOld, uOld);
2277 }
2278 else
2279 {
2280 *puEax = uDst;
2281 *pEFlags = iemAImpl_cmp_u32(*pEFlags, &uOld, uDst);
2282 }
2283 }
2284}
2285
2286
2287# if ARCH_BITS == 32
2288IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2289{
2290# if 0
2291 /* If correctly aligned, used the locked variation. */
2292 if (!((uintptr_t)pu32Dst & 7))
2293 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2294 else
2295# endif
2296 {
2297 /* Otherwise emulate it as best as we can. */
2298 uint64_t const uOld = *puRax;
2299 uint64_t const uSrc = *puSrcReg;
2300 uint64_t const uDst = *pu64Dst;
2301 if (uOld == uDst)
2302 {
2303 *pu64Dst = uSrc;
2304 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, uOld);
2305 }
2306 else
2307 {
2308 *puRax = uDst;
2309 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, uDst);
2310 }
2311 }
2312}
2313# else /* ARCH_BITS != 32 */
2314IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2315{
2316# if 0
2317 /* If correctly aligned, used the locked variation. */
2318 if (!((uintptr_t)pu64Dst & 7))
2319 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2320 else
2321# endif
2322 {
2323 /* Otherwise emulate it as best as we can. */
2324 uint64_t const uOld = *puRax;
2325 uint64_t const uDst = *pu64Dst;
2326 if (uOld == uDst)
2327 {
2328 *pu64Dst = uSrcReg;
2329 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, uOld);
2330 }
2331 else
2332 {
2333 *puRax = uDst;
2334 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, uDst);
2335 }
2336 }
2337}
2338# endif /* ARCH_BITS != 32 */
2339
2340
2341IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2342{
2343# if 0
2344 /* If correctly aligned, used the locked variation. */
2345 if (!((uintptr_t)pu64Dst & 7))
2346 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2347 else
2348# endif
2349 {
2350 /* Otherwise emulate it as best as we can. */
2351 uint64_t const uNew = pu64EbxEcx->u;
2352 uint64_t const uOld = pu64EaxEdx->u;
2353 uint64_t const uDst = *pu64Dst;
2354 if (uDst == uOld)
2355 {
2356 *pu64Dst = uNew;
2357 *pEFlags |= X86_EFL_ZF;
2358 }
2359 else
2360 {
2361 pu64EaxEdx->u = uDst;
2362 *pEFlags &= ~X86_EFL_ZF;
2363 }
2364 }
2365}
2366
2367
2368IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2369 uint32_t *pEFlags))
2370{
2371# if 0
2372 /* If correctly aligned, used the locked variation. */
2373 if (!((uintptr_t)pu64Dst & 15))
2374 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2375 else
2376# endif
2377 {
2378 /* Otherwise emulate it as best as we can. */
2379# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2380 uint128_t const uNew = pu128RbxRcx->u;
2381 uint128_t const uOld = pu128RaxRdx->u;
2382 uint128_t const uDst = pu128Dst->u;
2383 if (uDst == uOld)
2384 {
2385 pu128Dst->u = uNew;
2386 *pEFlags |= X86_EFL_ZF;
2387 }
2388 else
2389 {
2390 pu128RaxRdx->u = uDst;
2391 *pEFlags &= ~X86_EFL_ZF;
2392 }
2393# else
2394 RTUINT128U const uNew = *pu128RbxRcx;
2395 RTUINT128U const uOld = *pu128RaxRdx;
2396 RTUINT128U const uDst = *pu128Dst;
2397 if ( uDst.s.Lo == uOld.s.Lo
2398 && uDst.s.Hi == uOld.s.Hi)
2399 {
2400 *pu128Dst = uNew;
2401 *pEFlags |= X86_EFL_ZF;
2402 }
2403 else
2404 {
2405 *pu128RaxRdx = uDst;
2406 *pEFlags &= ~X86_EFL_ZF;
2407 }
2408# endif
2409 }
2410}
2411
2412#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2413
2414#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2415 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2416
2417/*
2418 * MUL, IMUL, DIV and IDIV helpers.
2419 *
2420 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2421 * division step so we can select between using C operators and
2422 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2423 *
2424 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
2425 * IDIV/DIV taking all their input from AX too. This means we have to abstract
2426 * some of the input loads and the result storing.
2427 */
2428
2429DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2430{
2431# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2432 pQuotient->s.Lo = 0;
2433 pQuotient->s.Hi = 0;
2434# endif
2435 RTUINT128U Divisor;
2436 Divisor.s.Lo = u64Divisor;
2437 Divisor.s.Hi = 0;
2438 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2439}
2440
/*
 * Operand access and arithmetic helpers plugged into the EMIT_MUL, EMIT_IMUL,
 * EMIT_DIV and EMIT_IDIV templates.
 *
 * The load/store helpers refer to the puA/puD (or puAX for the 8-bit
 * variants) parameters of the function they are expanded in.
 *
 * Every helper expands to a single parenthesized expression without a
 * trailing semicolon and parenthesizes its arguments, so it can be used as an
 * ordinary statement (also in a brace-less if/else) and be given compound
 * expressions as arguments.
 */

/* Loads the dividend from *puD:*puA (or from *puAX for the 8-bit variant). */
# define DIV_LOAD(a_Dividend) \
    ((a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD)
# define DIV_LOAD_U8(a_Dividend) \
    ((a_Dividend).u = *puAX)

/* Stores quotient and remainder to *puA and *puD (or the low and high byte of *puAX). */
# define DIV_STORE(a_Quotient, a_uRemainder)    (*puA = (a_Quotient), *puD = (a_uRemainder))
# define DIV_STORE_U8(a_Quotient, a_uRemainder) (*puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8))

/* Loads the implicit first factor from *puA (or the low byte of *puAX). */
# define MUL_LOAD_F1()                          (*puA)
# define MUL_LOAD_F1_U8()                       ((uint8_t)*puAX)

/* Stores the double width product to *puD:*puA (or *puAX). */
# define MUL_STORE(a_Result)                    (*puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi)
# define MUL_STORE_U8(a_Result)                 (*puAX = (a_Result).u)

/* Two's complement negation of a double width value. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    ((a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u)
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Unsigned multiplication producing a double width result. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    ((a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2))
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), (a_Factor1), (a_Factor2))

/* Unsigned division of a double width dividend, yielding quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    (   (a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
        (a_Remainder).u = (a_Dividend).u % (a_uDivisor))
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), (a_uDivisor))
2470
2471
2472/*
2473 * MUL
2474 */
2475# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2476IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2477{ \
2478 RTUINT ## a_cBitsWidth2x ## U Result; \
2479 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2480 a_fnStore(Result); \
2481 \
2482 /* Calc EFLAGS: */ \
2483 uint32_t fEfl = *pfEFlags; \
2484 if (a_fIntelFlags) \
2485 { /* Intel: 6700K and 10980XE behavior */ \
2486 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2487 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2488 fEfl |= X86_EFL_SF; \
2489 fEfl |= IEM_EFL_CALC_PARITY(Result.s.Lo); \
2490 if (Result.s.Hi != 0) \
2491 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2492 } \
2493 else \
2494 { /* AMD: 3990X */ \
2495 if (Result.s.Hi != 0) \
2496 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2497 else \
2498 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2499 } \
2500 *pfEFlags = fEfl; \
2501 return 0; \
2502} \
2503
2504# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2505 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2506 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2507 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2508
2509# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2510EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2511 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2512# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2513EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2514 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2515EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2516 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2517EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2518 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2519# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2520# endif /* !DOXYGEN_RUNNING */
2521
2522/*
2523 * MULX
2524 */
2525# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2526IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2527 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2528{ \
2529 RTUINT ## a_cBitsWidth2x ## U Result; \
2530 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2531 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2532 *puDst1 = Result.s.Hi; \
2533} \
2534
2535# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2536EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2537EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2538# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2539EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2540EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2541# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2542# endif /* !DOXYGEN_RUNNING */
2543
2544
2545/*
2546 * IMUL
2547 *
2548 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2549 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2550 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2551 */
2552# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2553 a_Suffix, a_fIntelFlags) \
2554IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2555{ \
2556 RTUINT ## a_cBitsWidth2x ## U Result; \
2557 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2558 \
2559 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2560 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2561 { \
2562 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2563 { \
2564 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2565 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2566 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2567 } \
2568 else \
2569 { \
2570 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2571 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2572 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2573 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2574 a_fnNeg(Result, a_cBitsWidth2x); \
2575 } \
2576 } \
2577 else \
2578 { \
2579 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2580 { \
2581 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2582 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2583 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2584 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2585 a_fnNeg(Result, a_cBitsWidth2x); \
2586 } \
2587 else \
2588 { \
2589 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2590 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2591 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2592 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2593 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2594 } \
2595 } \
2596 a_fnStore(Result); \
2597 \
2598 if (a_fIntelFlags) \
2599 { \
2600 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2601 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2602 fEfl |= X86_EFL_SF; \
2603 fEfl |= IEM_EFL_CALC_PARITY(Result.s.Lo & 0xff); \
2604 } \
2605 *pfEFlags = fEfl; \
2606 return 0; \
2607}
2608# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2609 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2610 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2611 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2612
2613# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2614EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2615 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2616# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2617EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2618 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2619EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2620 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2621EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2622 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2623# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2624# endif /* !DOXYGEN_RUNNING */
2625
2626
2627/*
2628 * IMUL with two operands is mapped onto the three operand variant, ignoring
2629 * the high part of the product.
2630 */
2631# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2632IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_imul_two_u ## a_cBits,(uint32_t fEFlags, a_uType *puDst, a_uType uSrc)) \
2633{ \
2634 a_uType uIgn; \
2635 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, &fEFlags); \
2636 return fEFlags; \
2637} \
2638\
2639IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_imul_two_u ## a_cBits ## _intel,(uint32_t fEFlags, a_uType *puDst, a_uType uSrc)) \
2640{ \
2641 a_uType uIgn; \
2642 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, &fEFlags); \
2643 return fEFlags; \
2644} \
2645\
2646IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_imul_two_u ## a_cBits ## _amd,(uint32_t fEFlags, a_uType *puDst, a_uType uSrc)) \
2647{ \
2648 a_uType uIgn; \
2649 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, &fEFlags); \
2650 return fEFlags; \
2651}
2652
2653EMIT_IMUL_TWO(64, uint64_t)
2654# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2655EMIT_IMUL_TWO(32, uint32_t)
2656EMIT_IMUL_TWO(16, uint16_t)
2657# endif
2658
2659
2660/*
2661 * DIV
2662 */
2663# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2664 a_Suffix, a_fIntelFlags) \
2665IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2666{ \
2667 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2668 a_fnLoad(Dividend); \
2669 if ( uDivisor != 0 \
2670 && Dividend.s.Hi < uDivisor) \
2671 { \
2672 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2673 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2674 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2675 \
2676 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2677 if (!a_fIntelFlags) \
2678 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2679 return 0; \
2680 } \
2681 /* #DE */ \
2682 return -1; \
2683}
2684# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2685 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2686 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2687 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2688
2689# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2690EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2691 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2692# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2693EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2694 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2695EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2696 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2697EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2698 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2699# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2700# endif /* !DOXYGEN_RUNNING */
2701
2702
2703/*
2704 * IDIV
2705 *
2706 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2707 * set AF and clear PF, ZF and SF just like it does for DIV.
2708 *
2709 */
2710# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2711 a_Suffix, a_fIntelFlags) \
2712IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2713{ \
2714 /* Note! Skylake leaves all flags alone. */ \
2715 \
2716 /** @todo overflow checks */ \
2717 if (uDivisor != 0) \
2718 { \
2719 /* \
2720 * Convert to unsigned division. \
2721 */ \
2722 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2723 a_fnLoad(Dividend); \
2724 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2725 if (fSignedDividend) \
2726 a_fnNeg(Dividend, a_cBitsWidth2x); \
2727 \
2728 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2729 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2730 uDivisorPositive = uDivisor; \
2731 else \
2732 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2733 \
2734 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2735 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2736 \
2737 /* \
2738 * Setup the result, checking for overflows. \
2739 */ \
2740 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2741 { \
2742 if (!fSignedDividend) \
2743 { \
2744 /* Positive divisor, positive dividend => result positive. */ \
2745 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2746 { \
2747 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2748 if (!a_fIntelFlags) \
2749 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2750 return 0; \
2751 } \
2752 } \
2753 else \
2754 { \
2755 /* Positive divisor, negative dividend => result negative. */ \
2756 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2757 { \
2758 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2759 if (!a_fIntelFlags) \
2760 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2761 return 0; \
2762 } \
2763 } \
2764 } \
2765 else \
2766 { \
2767 if (!fSignedDividend) \
2768 { \
2769 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2770 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2771 { \
2772 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2773 if (!a_fIntelFlags) \
2774 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2775 return 0; \
2776 } \
2777 } \
2778 else \
2779 { \
2780 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2781 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2782 { \
2783 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2784 if (!a_fIntelFlags) \
2785 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2786 return 0; \
2787 } \
2788 } \
2789 } \
2790 } \
2791 /* #DE */ \
2792 return -1; \
2793}
2794# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2795 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2796 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2797 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2798
2799# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2800EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2801 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2802# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2803EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2804 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2805EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2806 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2807EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2808 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2809# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2810# endif /* !DOXYGEN_RUNNING */
2811
2812#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2813
2814
2815/*********************************************************************************************************************************
2816* Unary operations. *
2817*********************************************************************************************************************************/
2818#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2819
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Recalculates PF, AF, ZF, SF and OF in an EFLAGS value after an INC or DEC.
 *
 * CF is carried over unchanged for hysterical raisins (allegedly for carrying
 * and borrowing in arithmetic loops on intel 8008).
 *
 * This is a statement macro, it does not produce a value.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_OfMethod      0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        /* Start from the old flags with every status bit but CF cleared. */ \
        uint32_t fEflNew = *(a_pfEFlags) & ~(X86_EFL_STATUS_BITS & ~X86_EFL_CF); \
        fEflNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflNew |= IEM_EFL_CALC_PARITY(a_uResult); \
        /* AF: the AF bit position differs between input and result. */ \
        fEflNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        if ((a_OfMethod) == 0) \
            /* INC: top bit clear in the input and set in the result. */ \
            fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth(((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)); \
        else \
            /* DEC: top bit set in the input and clear in the result. */ \
            fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))); \
        *(a_pfEFlags) = fEflNew; \
    } while (0)
2845
2846/*
2847 * INC
2848 */
2849
2850IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2851{
2852 uint64_t uDst = *puDst;
2853 uint64_t uResult = uDst + 1;
2854 *puDst = uResult;
2855 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2856}
2857
2858# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2859
2860IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2861{
2862 uint32_t uDst = *puDst;
2863 uint32_t uResult = uDst + 1;
2864 *puDst = uResult;
2865 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2866}
2867
2868
2869IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2870{
2871 uint16_t uDst = *puDst;
2872 uint16_t uResult = uDst + 1;
2873 *puDst = uResult;
2874 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2875}
2876
2877IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2878{
2879 uint8_t uDst = *puDst;
2880 uint8_t uResult = uDst + 1;
2881 *puDst = uResult;
2882 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2883}
2884
2885# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2886
2887
2888/*
2889 * DEC
2890 */
2891
2892IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2893{
2894 uint64_t uDst = *puDst;
2895 uint64_t uResult = uDst - 1;
2896 *puDst = uResult;
2897 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2898}
2899
2900# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2901
2902IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2903{
2904 uint32_t uDst = *puDst;
2905 uint32_t uResult = uDst - 1;
2906 *puDst = uResult;
2907 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2908}
2909
2910
2911IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2912{
2913 uint16_t uDst = *puDst;
2914 uint16_t uResult = uDst - 1;
2915 *puDst = uResult;
2916 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2917}
2918
2919
2920IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2921{
2922 uint8_t uDst = *puDst;
2923 uint8_t uResult = uDst - 1;
2924 *puDst = uResult;
2925 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2926}
2927
2928# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2929
2930
2931/*
2932 * NOT
2933 */
2934
2935IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2936{
2937 uint64_t uDst = *puDst;
2938 uint64_t uResult = ~uDst;
2939 *puDst = uResult;
2940 /* EFLAGS are not modified. */
2941 RT_NOREF_PV(pfEFlags);
2942}
2943
2944# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2945
2946IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2947{
2948 uint32_t uDst = *puDst;
2949 uint32_t uResult = ~uDst;
2950 *puDst = uResult;
2951 /* EFLAGS are not modified. */
2952 RT_NOREF_PV(pfEFlags);
2953}
2954
2955IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2956{
2957 uint16_t uDst = *puDst;
2958 uint16_t uResult = ~uDst;
2959 *puDst = uResult;
2960 /* EFLAGS are not modified. */
2961 RT_NOREF_PV(pfEFlags);
2962}
2963
2964IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2965{
2966 uint8_t uDst = *puDst;
2967 uint8_t uResult = ~uDst;
2968 *puDst = uResult;
2969 /* EFLAGS are not modified. */
2970 RT_NOREF_PV(pfEFlags);
2971}
2972
2973# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2974
2975
2976/*
2977 * NEG
2978 */
2979
/**
 * Recalculates all status bits (CF, PF, AF, ZF, SF, and OF) in an EFLAGS value
 * after a NEG instruction.
 *
 * CF is set when the original operand was non-zero.  This is a statement macro,
 * it does not produce a value.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for CF, AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflNew = *(a_pfEFlags) & ~(X86_EFL_STATUS_BITS | X86_EFL_CF); \
        if ((a_uDst) != 0) \
            fEflNew |= X86_EFL_CF; \
        fEflNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflNew |= IEM_EFL_CALC_PARITY(a_uResult); \
        fEflNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        /* OF: the top bit is set in both the input and the result. */ \
        fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        *(a_pfEFlags) = fEflNew; \
    } while (0)
3001
3002IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
3003{
3004 uint64_t uDst = *puDst;
3005 uint64_t uResult = (uint64_t)0 - uDst;
3006 *puDst = uResult;
3007 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
3008}
3009
3010# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3011
3012IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
3013{
3014 uint32_t uDst = *puDst;
3015 uint32_t uResult = (uint32_t)0 - uDst;
3016 *puDst = uResult;
3017 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
3018}
3019
3020
3021IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
3022{
3023 uint16_t uDst = *puDst;
3024 uint16_t uResult = (uint16_t)0 - uDst;
3025 *puDst = uResult;
3026 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
3027}
3028
3029
3030IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
3031{
3032 uint8_t uDst = *puDst;
3033 uint8_t uResult = (uint8_t)0 - uDst;
3034 *puDst = uResult;
3035 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
3036}
3037
3038# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3039
3040/*
3041 * Locked variants.
3042 */
3043
3044/** Emit a function for doing a locked unary operand operation. */
3045# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
3046 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
3047 uint32_t *pfEFlags)) \
3048 { \
3049 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
3050 uint ## a_cBitsWidth ## _t uTmp; \
3051 uint32_t fEflTmp; \
3052 do \
3053 { \
3054 uTmp = uOld; \
3055 fEflTmp = *pfEFlags; \
3056 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
3057 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
3058 *pfEFlags = fEflTmp; \
3059 }
3060
3061EMIT_LOCKED_UNARY_OP(inc, 64)
3062EMIT_LOCKED_UNARY_OP(dec, 64)
3063EMIT_LOCKED_UNARY_OP(not, 64)
3064EMIT_LOCKED_UNARY_OP(neg, 64)
3065# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3066EMIT_LOCKED_UNARY_OP(inc, 32)
3067EMIT_LOCKED_UNARY_OP(dec, 32)
3068EMIT_LOCKED_UNARY_OP(not, 32)
3069EMIT_LOCKED_UNARY_OP(neg, 32)
3070
3071EMIT_LOCKED_UNARY_OP(inc, 16)
3072EMIT_LOCKED_UNARY_OP(dec, 16)
3073EMIT_LOCKED_UNARY_OP(not, 16)
3074EMIT_LOCKED_UNARY_OP(neg, 16)
3075
3076EMIT_LOCKED_UNARY_OP(inc, 8)
3077EMIT_LOCKED_UNARY_OP(dec, 8)
3078EMIT_LOCKED_UNARY_OP(not, 8)
3079EMIT_LOCKED_UNARY_OP(neg, 8)
3080# endif
3081
3082#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
3083
3084
3085/*********************************************************************************************************************************
3086* Shifting and Rotating *
3087*********************************************************************************************************************************/
3088
3089/*
3090 * ROL
3091 */
3092#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
3093IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
3094{ \
3095 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3096 if (cShift) \
3097 { \
3098 if (a_cBitsWidth < 32) \
3099 cShift &= a_cBitsWidth - 1; \
3100 a_uType const uDst = *puDst; \
3101 a_uType const uResult = a_fnHlp(uDst, cShift); \
3102 *puDst = uResult; \
3103 \
3104 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3105 it the same way as for 1 bit shifts. */ \
3106 AssertCompile(X86_EFL_CF_BIT == 0); \
3107 fEFlags &= ~(X86_EFL_CF | X86_EFL_OF); \
3108 uint32_t const fCarry = (uResult & X86_EFL_CF); \
3109 fEFlags |= fCarry; \
3110 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3111 fEFlags |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
3112 else /* Intel 10980XE: According to the first sub-shift: */ \
3113 fEFlags |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3114 } \
3115 return fEFlags; \
3116}
3117
3118#ifndef RT_ARCH_ARM64
3119
3120# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3121EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
3122# endif
3123EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
3124EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
3125
3126# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3127EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
3128# endif
3129EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
3130EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
3131
3132DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
3133{
3134 return (uValue << cShift) | (uValue >> (16 - cShift));
3135}
3136# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3137EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
3138# endif
3139EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
3140EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
3141
3142DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
3143{
3144 return (uValue << cShift) | (uValue >> (8 - cShift));
3145}
3146# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3147EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
3148# endif
3149EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
3150EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
3151
3152#endif /* !RT_ARCH_ARM64 */
3153
3154/*
3155 * ROR
3156 */
3157#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
3158IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
3159{ \
3160 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3161 if (cShift) \
3162 { \
3163 if (a_cBitsWidth < 32) \
3164 cShift &= a_cBitsWidth - 1; \
3165 a_uType const uDst = *puDst; \
3166 a_uType const uResult = a_fnHlp(uDst, cShift); \
3167 *puDst = uResult; \
3168 \
3169 /* Calc EFLAGS: */ \
3170 AssertCompile(X86_EFL_CF_BIT == 0); \
3171 fEFlags &= ~(X86_EFL_CF | X86_EFL_OF); \
3172 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
3173 fEFlags |= fCarry; \
3174 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3175 fEFlags |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
3176 else /* Intel 10980XE: According to the first sub-shift: */ \
3177 fEFlags |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
3178 } \
3179 return fEFlags; \
3180}
3181
3182#ifndef RT_ARCH_ARM64
3183
3184# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3185EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
3186# endif
3187EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
3188EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
3189
3190# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3191EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
3192# endif
3193EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
3194EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
3195
3196DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
3197{
3198 return (uValue >> cShift) | (uValue << (16 - cShift));
3199}
3200# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3201EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
3202# endif
3203EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
3204EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
3205
3206DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
3207{
3208 return (uValue >> cShift) | (uValue << (8 - cShift));
3209}
3210# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3211EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
3212# endif
3213EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
3214EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
3215
3216#endif /* !RT_ARCH_ARM64 */
3217
3218/*
3219 * RCL
3220 */
3221#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3222IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
3223{ \
3224 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3225 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3226 cShift %= a_cBitsWidth + 1; \
3227 if (cShift) \
3228 { \
3229 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3230 cShift %= a_cBitsWidth + 1; \
3231 a_uType const uDst = *puDst; \
3232 a_uType uResult = uDst << cShift; \
3233 if (cShift > 1) \
3234 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
3235 \
3236 AssertCompile(X86_EFL_CF_BIT == 0); \
3237 uint32_t fInCarry = fEFlags & X86_EFL_CF; \
3238 uResult |= (a_uType)fInCarry << (cShift - 1); \
3239 \
3240 *puDst = uResult; \
3241 \
3242 /* Calc EFLAGS. */ \
3243 fEFlags &= ~(X86_EFL_CF | X86_EFL_OF); \
3244 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3245 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
3246 fEFlags |= fOutCarry; \
3247 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3248 fEFlags |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3249 else /* Intel 10980XE: According to the first sub-shift: */ \
3250 fEFlags |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3251 } \
3252 return fEFlags; \
3253}
3254
3255#ifndef RT_ARCH_ARM64
3256
3257# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3258EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3259# endif
3260EMIT_RCL(64, uint64_t, _intel, 1)
3261EMIT_RCL(64, uint64_t, _amd, 0)
3262
3263# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3264EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3265# endif
3266EMIT_RCL(32, uint32_t, _intel, 1)
3267EMIT_RCL(32, uint32_t, _amd, 0)
3268
3269# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3270EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3271# endif
3272EMIT_RCL(16, uint16_t, _intel, 1)
3273EMIT_RCL(16, uint16_t, _amd, 0)
3274
3275# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3276EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3277# endif
3278EMIT_RCL(8, uint8_t, _intel, 1)
3279EMIT_RCL(8, uint8_t, _amd, 0)
3280
3281#endif /* !RT_ARCH_ARM64 */
3282
3283
3284/*
3285 * RCR
3286 */
3287#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3288IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
3289{ \
3290 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3291 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3292 cShift %= a_cBitsWidth + 1; \
3293 if (cShift) \
3294 { \
3295 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3296 cShift %= a_cBitsWidth + 1; \
3297 a_uType const uDst = *puDst; \
3298 a_uType uResult = uDst >> cShift; \
3299 if (cShift > 1) \
3300 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3301 \
3302 AssertCompile(X86_EFL_CF_BIT == 0); \
3303 uint32_t fInCarry = fEFlags & X86_EFL_CF; \
3304 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3305 *puDst = uResult; \
3306 \
3307 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3308 it the same way as for 1 bit shifts. */ \
3309 fEFlags &= ~(X86_EFL_CF | X86_EFL_OF); \
3310 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3311 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3312 fEFlags |= fOutCarry; \
3313 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3314 fEFlags |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3315 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3316 fEFlags |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3317 } \
3318 return fEFlags; \
3319}
3320
3321#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3322EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3323#endif
3324EMIT_RCR(64, uint64_t, _intel, 1)
3325EMIT_RCR(64, uint64_t, _amd, 0)
3326
3327#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3328EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3329#endif
3330EMIT_RCR(32, uint32_t, _intel, 1)
3331EMIT_RCR(32, uint32_t, _amd, 0)
3332
3333#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3334EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3335#endif
3336EMIT_RCR(16, uint16_t, _intel, 1)
3337EMIT_RCR(16, uint16_t, _amd, 0)
3338
3339#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3340EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3341#endif
3342EMIT_RCR(8, uint8_t, _intel, 1)
3343EMIT_RCR(8, uint8_t, _amd, 0)
3344
3345
3346/*
3347 * SHL
3348 */
3349#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3350IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
3351{ \
3352 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3353 if (cShift) \
3354 { \
3355 a_uType const uDst = *puDst; \
3356 a_uType uResult = uDst << cShift; \
3357 *puDst = uResult; \
3358 \
3359 /* Calc EFLAGS. */ \
3360 AssertCompile(X86_EFL_CF_BIT == 0); \
3361 fEFlags &= ~X86_EFL_STATUS_BITS; \
3362 uint32_t const fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3363 fEFlags |= fCarry; \
3364 if (!a_fIntelFlags) \
3365 fEFlags |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3366 else \
3367 fEFlags |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3368 fEFlags |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3369 fEFlags |= X86_EFL_CALC_ZF(uResult); \
3370 fEFlags |= IEM_EFL_CALC_PARITY(uResult); \
3371 if (!a_fIntelFlags) \
3372 fEFlags |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3373 } \
3374 return fEFlags; \
3375}
3376
3377#if !defined(RT_ARCH_ARM64)
3378
3379# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3380EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3381# endif
3382EMIT_SHL(64, uint64_t, _intel, 1)
3383EMIT_SHL(64, uint64_t, _amd, 0)
3384
3385# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3386EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3387# endif
3388EMIT_SHL(32, uint32_t, _intel, 1)
3389EMIT_SHL(32, uint32_t, _amd, 0)
3390
3391# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3392EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3393# endif
3394EMIT_SHL(16, uint16_t, _intel, 1)
3395EMIT_SHL(16, uint16_t, _amd, 0)
3396
3397# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3398EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3399# endif
3400EMIT_SHL(8, uint8_t, _intel, 1)
3401EMIT_SHL(8, uint8_t, _amd, 0)
3402
3403#endif /* !RT_ARCH_ARM64 */
3404
3405
3406/*
3407 * SHR
3408 */
3409#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3410IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
3411{ \
3412 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3413 if (cShift) \
3414 { \
3415 a_uType const uDst = *puDst; \
3416 a_uType uResult = uDst >> cShift; \
3417 *puDst = uResult; \
3418 \
3419 /* Calc EFLAGS. */ \
3420 AssertCompile(X86_EFL_CF_BIT == 0); \
3421 fEFlags &= ~X86_EFL_STATUS_BITS; \
3422 fEFlags |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3423 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3424 fEFlags |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3425 fEFlags |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3426 fEFlags |= X86_EFL_CALC_ZF(uResult); \
3427 fEFlags |= IEM_EFL_CALC_PARITY(uResult); \
3428 if (!a_fIntelFlags) \
3429 fEFlags |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3430 } \
3431 return fEFlags; \
3432}
3433
3434#if !defined(RT_ARCH_ARM64)
3435
3436# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3437EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3438# endif
3439EMIT_SHR(64, uint64_t, _intel, 1)
3440EMIT_SHR(64, uint64_t, _amd, 0)
3441
3442# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3443EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3444# endif
3445EMIT_SHR(32, uint32_t, _intel, 1)
3446EMIT_SHR(32, uint32_t, _amd, 0)
3447
3448# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3449EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3450# endif
3451EMIT_SHR(16, uint16_t, _intel, 1)
3452EMIT_SHR(16, uint16_t, _amd, 0)
3453
3454# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3455EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3456# endif
3457EMIT_SHR(8, uint8_t, _intel, 1)
3458EMIT_SHR(8, uint8_t, _amd, 0)
3459
3460#endif /* !RT_ARCH_ARM64 */
3461
3462
3463/*
3464 * SAR
3465 */
3466#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3467IEM_DECL_IMPL_DEF(uint32_t, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(uint32_t fEFlags, a_uType *puDst, uint8_t cShift)) \
3468{ \
3469 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3470 if (cShift) \
3471 { \
3472 a_iType const iDst = (a_iType)*puDst; \
3473 a_uType uResult = iDst >> cShift; \
3474 *puDst = uResult; \
3475 \
3476 /* Calc EFLAGS. \
3477 Note! The OF flag is always zero because the result never differs from the input. */ \
3478 AssertCompile(X86_EFL_CF_BIT == 0); \
3479 fEFlags &= ~X86_EFL_STATUS_BITS; \
3480 fEFlags |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3481 fEFlags |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3482 fEFlags |= X86_EFL_CALC_ZF(uResult); \
3483 fEFlags |= IEM_EFL_CALC_PARITY(uResult); \
3484 if (!a_fIntelFlags) \
3485 fEFlags |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3486 } \
3487 return fEFlags; \
3488}
3489
3490#if !defined(RT_ARCH_ARM64)
3491
3492# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3493EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3494# endif
3495EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3496EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3497
3498# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3499EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3500# endif
3501EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3502EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3503
3504# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3505EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3506# endif
3507EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3508EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3509
3510# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3511EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3512# endif
3513EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3514EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3515
3516#endif /* !RT_ARCH_ARM64 */
3517
3518
3519/*
3520 * SHLD
3521 *
3522 * - CF is the last bit shifted out of puDst.
3523 * - AF is always cleared by Intel 10980XE.
3524 * - AF is always set by AMD 3990X.
3525 * - OF is set according to the first shift on Intel 10980XE, it seems.
3526 * - OF is set according to the last sub-shift on AMD 3990X.
3527 * - ZF, SF and PF are calculated according to the result by both vendors.
3528 *
3529 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3530 * pick either the source register or the destination register for input bits
3531 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3532 * intel has changed behaviour here several times. We implement what current
3533 * skylake based does for now, we can extend this later as needed.
3534 */
3535#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3536IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3537 uint32_t *pfEFlags)) \
3538{ \
3539 cShift &= a_cBitsWidth - 1; \
3540 if (cShift) \
3541 { \
3542 a_uType const uDst = *puDst; \
3543 a_uType uResult = uDst << cShift; \
3544 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3545 *puDst = uResult; \
3546 \
3547 /* CALC EFLAGS: */ \
3548 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3549 if (a_fIntelFlags) \
3550 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3551 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3552 else \
3553 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3554 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3555 fEfl |= X86_EFL_AF; \
3556 } \
3557 AssertCompile(X86_EFL_CF_BIT == 0); \
3558 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3559 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3560 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3561 fEfl |= X86_EFL_CALC_ZF(uResult); \
3562 *pfEFlags = fEfl; \
3563 } \
3564}
3565
3566#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3567EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3568#endif
3569EMIT_SHLD(64, uint64_t, _intel, 1)
3570EMIT_SHLD(64, uint64_t, _amd, 0)
3571
3572#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3573EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3574#endif
3575EMIT_SHLD(32, uint32_t, _intel, 1)
3576EMIT_SHLD(32, uint32_t, _amd, 0)
3577
3578#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3579IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3580{ \
3581 cShift &= 31; \
3582 if (cShift) \
3583 { \
3584 uint16_t const uDst = *puDst; \
3585 uint64_t const uTmp = a_fIntelFlags \
3586 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3587 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3588 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3589 *puDst = uResult; \
3590 \
3591 /* CALC EFLAGS: */ \
3592 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3593 AssertCompile(X86_EFL_CF_BIT == 0); \
3594 if (a_fIntelFlags) \
3595 { \
3596 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3597 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3598 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3599 } \
3600 else \
3601 { \
3602 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3603 if (cShift < 16) \
3604 { \
3605 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3606 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3607 } \
3608 else \
3609 { \
3610 if (cShift == 16) \
3611 fEfl |= uDst & X86_EFL_CF; \
3612 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3613 } \
3614 fEfl |= X86_EFL_AF; \
3615 } \
3616 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3617 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3618 fEfl |= X86_EFL_CALC_ZF(uResult); \
3619 *pfEFlags = fEfl; \
3620 } \
3621}
3622
3623#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3624EMIT_SHLD_16(RT_NOTHING, 1)
3625#endif
3626EMIT_SHLD_16(_intel, 1)
3627EMIT_SHLD_16(_amd, 0)
3628
3629
3630/*
3631 * SHRD
3632 *
3633 * EFLAGS behaviour seems to be the same as with SHLD:
3634 * - CF is the last bit shifted out of puDst.
3635 * - AF is always cleared by Intel 10980XE.
3636 * - AF is always set by AMD 3990X.
3637 * - OF is set according to the first shift on Intel 10980XE, it seems.
3638 * - OF is set according to the last sub-shift on AMD 3990X.
3639 * - ZF, SF and PF are calculated according to the result by both vendors.
3640 *
3641 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3642 * pick either the source register or the destination register for input bits
3643 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3644 * intel has changed behaviour here several times. We implement what current
3645 * skylake based does for now, we can extend this later as needed.
3646 */
3647#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3648IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3649{ \
3650 cShift &= a_cBitsWidth - 1; \
3651 if (cShift) \
3652 { \
3653 a_uType const uDst = *puDst; \
3654 a_uType uResult = uDst >> cShift; \
3655 uResult |= uSrc << (a_cBitsWidth - cShift); \
3656 *puDst = uResult; \
3657 \
3658 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3659 AssertCompile(X86_EFL_CF_BIT == 0); \
3660 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3661 if (a_fIntelFlags) \
3662 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3663 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3664 else \
3665 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3666 if (cShift > 1) /* Set according to last shift. */ \
3667 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3668 else \
3669 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3670 fEfl |= X86_EFL_AF; \
3671 } \
3672 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3673 fEfl |= X86_EFL_CALC_ZF(uResult); \
3674 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3675 *pfEFlags = fEfl; \
3676 } \
3677}
3678
3679#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3680EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3681#endif
3682EMIT_SHRD(64, uint64_t, _intel, 1)
3683EMIT_SHRD(64, uint64_t, _amd, 0)
3684
3685#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3686EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3687#endif
3688EMIT_SHRD(32, uint32_t, _intel, 1)
3689EMIT_SHRD(32, uint32_t, _amd, 0)
3690
3691#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3692IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3693{ \
3694 cShift &= 31; \
3695 if (cShift) \
3696 { \
3697 uint16_t const uDst = *puDst; \
3698 uint64_t const uTmp = a_fIntelFlags \
3699 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3700 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3701 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3702 *puDst = uResult; \
3703 \
3704 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3705 AssertCompile(X86_EFL_CF_BIT == 0); \
3706 if (a_fIntelFlags) \
3707 { \
3708 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3709 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3710 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3711 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3712 } \
3713 else \
3714 { \
3715 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3716 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3717 /* AMD 3990X: Set according to last shift. AF always set. */ \
3718 if (cShift > 1) /* Set according to last shift. */ \
3719 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3720 else \
3721 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3722 fEfl |= X86_EFL_AF; \
3723 } \
3724 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3725 fEfl |= X86_EFL_CALC_ZF(uResult); \
3726 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3727 *pfEFlags = fEfl; \
3728 } \
3729}
3730
3731#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3732EMIT_SHRD_16(RT_NOTHING, 1)
3733#endif
3734EMIT_SHRD_16(_intel, 1)
3735EMIT_SHRD_16(_amd, 0)
3736
3737
3738/*
3739 * RORX (BMI2)
3740 */
3741#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3742IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3743{ \
3744 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3745}
3746
3747#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3748EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3749#endif
3750#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3751EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3752#endif
3753
3754
3755/*
3756 * SHLX (BMI2)
3757 */
3758#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3759IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3760{ \
3761 cShift &= a_cBitsWidth - 1; \
3762 *puDst = uSrc << cShift; \
3763}
3764
3765#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3766EMIT_SHLX(64, uint64_t, RT_NOTHING)
3767EMIT_SHLX(64, uint64_t, _fallback)
3768#endif
3769#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3770EMIT_SHLX(32, uint32_t, RT_NOTHING)
3771EMIT_SHLX(32, uint32_t, _fallback)
3772#endif
3773
3774
3775/*
3776 * SHRX (BMI2)
3777 */
3778#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3779IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3780{ \
3781 cShift &= a_cBitsWidth - 1; \
3782 *puDst = uSrc >> cShift; \
3783}
3784
3785#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3786EMIT_SHRX(64, uint64_t, RT_NOTHING)
3787EMIT_SHRX(64, uint64_t, _fallback)
3788#endif
3789#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3790EMIT_SHRX(32, uint32_t, RT_NOTHING)
3791EMIT_SHRX(32, uint32_t, _fallback)
3792#endif
3793
3794
3795/*
3796 * SARX (BMI2)
3797 */
3798#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3799IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3800{ \
3801 cShift &= a_cBitsWidth - 1; \
3802 *puDst = (a_iType)uSrc >> cShift; \
3803}
3804
3805#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3806EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3807EMIT_SARX(64, uint64_t, int64_t, _fallback)
3808#endif
3809#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3810EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3811EMIT_SARX(32, uint32_t, int32_t, _fallback)
3812#endif
3813
3814
3815/*
3816 * PDEP (BMI2)
3817 */
3818#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3819IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3820{ \
3821 a_uType uResult = 0; \
3822 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3823 if (fMask & ((a_uType)1 << iMaskBit)) \
3824 { \
3825 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3826 iBit++; \
3827 } \
3828 *puDst = uResult; \
3829}
3830
3831#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3832EMIT_PDEP(64, uint64_t, RT_NOTHING)
3833#endif
3834EMIT_PDEP(64, uint64_t, _fallback)
3835#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3836EMIT_PDEP(32, uint32_t, RT_NOTHING)
3837#endif
3838EMIT_PDEP(32, uint32_t, _fallback)
3839
3840/*
3841 * PEXT (BMI2)
3842 */
3843#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3844IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3845{ \
3846 a_uType uResult = 0; \
3847 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3848 if (fMask & ((a_uType)1 << iMaskBit)) \
3849 { \
3850 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3851 iBit++; \
3852 } \
3853 *puDst = uResult; \
3854}
3855
3856#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3857EMIT_PEXT(64, uint64_t, RT_NOTHING)
3858#endif
3859EMIT_PEXT(64, uint64_t, _fallback)
3860#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3861EMIT_PEXT(32, uint32_t, RT_NOTHING)
3862#endif
3863EMIT_PEXT(32, uint32_t, _fallback)
3864
3865
3866#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3867
3868# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3869/*
3870 * BSWAP
3871 */
3872
3873IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3874{
3875 *puDst = ASMByteSwapU64(*puDst);
3876}
3877
3878
3879IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3880{
3881 *puDst = ASMByteSwapU32(*puDst);
3882}
3883
3884
3885/* Note! undocument, so 32-bit arg */
3886IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3887{
3888#if 0
3889 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3890#else
3891 /* This is the behaviour AMD 3990x (64-bit mode): */
3892 *(uint16_t *)puDst = 0;
3893#endif
3894}
3895
3896# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3897
3898
3899
# ifdef IEM_WITHOUT_ASSEMBLY

/*
 * LFENCE, SFENCE & MFENCE, each forwarding to the corresponding ASM*Fence helper.
 */

/** LFENCE worker: issues ASMReadFence(). */
IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
{
    ASMReadFence();
}


/** SFENCE worker: issues ASMWriteFence(). */
IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
{
    ASMWriteFence();
}


/** MFENCE worker: issues ASMMemoryFence(). */
IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
{
    ASMMemoryFence();
}


# if !defined(RT_ARCH_ARM64)
/** Alternative memory fence worker (not emitted for ARM64 builds): issues ASMMemoryFence(). */
IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
{
    ASMMemoryFence();
}
# endif

# endif /* IEM_WITHOUT_ASSEMBLY */
3932
3933#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3934
3935
3936IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_arpl,(uint32_t fEFlags, uint16_t *pu16Dst, uint16_t u16Src))
3937{
3938 uint16_t u16Dst = *pu16Dst;
3939 if ((u16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3940 {
3941 u16Dst &= X86_SEL_MASK_OFF_RPL;
3942 u16Dst |= u16Src & X86_SEL_RPL;
3943 *pu16Dst = u16Dst;
3944
3945 fEFlags |= X86_EFL_ZF;
3946 }
3947 else
3948 fEFlags &= ~X86_EFL_ZF;
3949 return fEFlags;
3950}
3951
3952
3953#if defined(IEM_WITHOUT_ASSEMBLY)
3954
3955/*********************************************************************************************************************************
3956* x87 FPU Loads *
3957*********************************************************************************************************************************/
3958
3959IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3960{
3961 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3962 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3963 {
3964 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3965 pFpuRes->r80Result.sj64.fInteger = 1;
3966 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3967 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3968 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3969 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3970 }
3971 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3972 {
3973 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3974 pFpuRes->r80Result.s.uExponent = 0;
3975 pFpuRes->r80Result.s.uMantissa = 0;
3976 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3977 }
3978 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3979 {
3980 /* Subnormal values gets normalized. */
3981 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3982 pFpuRes->r80Result.sj64.fInteger = 1;
3983 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3984 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3985 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3986 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3987 pFpuRes->FSW |= X86_FSW_DE;
3988 if (!(pFpuState->FCW & X86_FCW_DM))
3989 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3990 }
3991 else if (RTFLOAT32U_IS_INF(pr32Val))
3992 {
3993 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3994 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3995 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3996 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3997 }
3998 else
3999 {
4000 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
4001 Assert(RTFLOAT32U_IS_NAN(pr32Val));
4002 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
4003 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
4004 pFpuRes->r80Result.sj64.fInteger = 1;
4005 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
4006 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
4007 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
4008 {
4009 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
4010 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4011 pFpuRes->FSW |= X86_FSW_IE;
4012
4013 if (!(pFpuState->FCW & X86_FCW_IM))
4014 {
4015 /* The value is not pushed. */
4016 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
4017 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
4018 pFpuRes->r80Result.au64[0] = 0;
4019 pFpuRes->r80Result.au16[4] = 0;
4020 }
4021 }
4022 else
4023 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4024 }
4025}
4026
4027
4028IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
4029{
4030 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4031 if (RTFLOAT64U_IS_NORMAL(pr64Val))
4032 {
4033 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
4034 pFpuRes->r80Result.sj64.fInteger = 1;
4035 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4036 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
4037 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
4038 }
4039 else if (RTFLOAT64U_IS_ZERO(pr64Val))
4040 {
4041 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
4042 pFpuRes->r80Result.s.uExponent = 0;
4043 pFpuRes->r80Result.s.uMantissa = 0;
4044 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
4045 }
4046 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
4047 {
4048 /* Subnormal values gets normalized. */
4049 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
4050 pFpuRes->r80Result.sj64.fInteger = 1;
4051 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
4052 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
4053 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
4054 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
4055 pFpuRes->FSW |= X86_FSW_DE;
4056 if (!(pFpuState->FCW & X86_FCW_DM))
4057 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
4058 }
4059 else if (RTFLOAT64U_IS_INF(pr64Val))
4060 {
4061 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
4062 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
4063 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
4064 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
4065 }
4066 else
4067 {
4068 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
4069 Assert(RTFLOAT64U_IS_NAN(pr64Val));
4070 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
4071 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
4072 pFpuRes->r80Result.sj64.fInteger = 1;
4073 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4074 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
4075 {
4076 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
4077 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4078 pFpuRes->FSW |= X86_FSW_IE;
4079
4080 if (!(pFpuState->FCW & X86_FCW_IM))
4081 {
4082 /* The value is not pushed. */
4083 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
4084 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
4085 pFpuRes->r80Result.au64[0] = 0;
4086 pFpuRes->r80Result.au16[4] = 0;
4087 }
4088 }
4089 else
4090 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4091 }
4092}
4093
4094
4095IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
4096{
4097 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
4098 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
4099 /* Raises no exceptions. */
4100 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4101}
4102
4103
4104IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4105{
4106 pFpuRes->r80Result.sj64.fSign = 0;
4107 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4108 pFpuRes->r80Result.sj64.fInteger = 1;
4109 pFpuRes->r80Result.sj64.uFraction = 0;
4110
4111 /*
4112 * FPU status word:
4113 * - TOP is irrelevant, but we must match x86 assembly version.
4114 * - C1 is always cleared as we don't have any stack overflows.
4115 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4116 */
4117 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4118}
4119
4120
4121IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4122{
4123 pFpuRes->r80Result.sj64.fSign = 0;
4124 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4125 pFpuRes->r80Result.sj64.fInteger = 1;
4126 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4127 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4128 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
4129 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4130}
4131
4132
4133IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4134{
4135 pFpuRes->r80Result.sj64.fSign = 0;
4136 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4137 pFpuRes->r80Result.sj64.fInteger = 1;
4138 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
4139 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
4140 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4141}
4142
4143
4144IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4145{
4146 pFpuRes->r80Result.sj64.fSign = 0;
4147 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
4148 pFpuRes->r80Result.sj64.fInteger = 1;
4149 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4150 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4151 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
4152 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4153}
4154
4155
4156IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4157{
4158 pFpuRes->r80Result.sj64.fSign = 0;
4159 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
4160 pFpuRes->r80Result.sj64.fInteger = 1;
4161 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4162 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4163 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
4164 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4165}
4166
4167
4168IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4169{
4170 pFpuRes->r80Result.sj64.fSign = 0;
4171 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4172 pFpuRes->r80Result.sj64.fInteger = 1;
4173 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4174 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4175 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
4176 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4177}
4178
4179
4180IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4181{
4182 pFpuRes->r80Result.s.fSign = 0;
4183 pFpuRes->r80Result.s.uExponent = 0;
4184 pFpuRes->r80Result.s.uMantissa = 0;
4185 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4186}
4187
4188#define EMIT_FILD(a_cBits) \
4189IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
4190 int ## a_cBits ## _t const *piVal)) \
4191{ \
4192 int ## a_cBits ## _t iVal = *piVal; \
4193 if (iVal == 0) \
4194 { \
4195 pFpuRes->r80Result.s.fSign = 0; \
4196 pFpuRes->r80Result.s.uExponent = 0; \
4197 pFpuRes->r80Result.s.uMantissa = 0; \
4198 } \
4199 else \
4200 { \
4201 if (iVal > 0) \
4202 pFpuRes->r80Result.s.fSign = 0; \
4203 else \
4204 { \
4205 pFpuRes->r80Result.s.fSign = 1; \
4206 iVal = -iVal; \
4207 } \
4208 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
4209 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
4210 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
4211 } \
4212 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
4213}
4214EMIT_FILD(16)
4215EMIT_FILD(32)
4216EMIT_FILD(64)
4217
4218
4219IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
4220{
4221 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4222 if ( pd80Val->s.abPairs[0] == 0
4223 && pd80Val->s.abPairs[1] == 0
4224 && pd80Val->s.abPairs[2] == 0
4225 && pd80Val->s.abPairs[3] == 0
4226 && pd80Val->s.abPairs[4] == 0
4227 && pd80Val->s.abPairs[5] == 0
4228 && pd80Val->s.abPairs[6] == 0
4229 && pd80Val->s.abPairs[7] == 0
4230 && pd80Val->s.abPairs[8] == 0)
4231 {
4232 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4233 pFpuRes->r80Result.s.uExponent = 0;
4234 pFpuRes->r80Result.s.uMantissa = 0;
4235 }
4236 else
4237 {
4238 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4239
4240 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
4241 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
4242 cPairs--;
4243
4244 uint64_t uVal = 0;
4245 uint64_t uFactor = 1;
4246 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
4247 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
4248 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
4249
4250 unsigned const cBits = ASMBitLastSetU64(uVal);
4251 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
4252 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
4253 }
4254}
4255
4256
4257/*********************************************************************************************************************************
4258* x87 FPU Stores *
4259*********************************************************************************************************************************/
4260
4261/**
4262 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4263 *
4264 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4265 *
4266 * @returns Updated FPU status word value.
4267 * @param fSignIn Incoming sign indicator.
4268 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4269 * @param iExponentIn Unbiased exponent.
4270 * @param fFcw The FPU control word.
4271 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4272 * @param pr32Dst Where to return the output value, if one should be
4273 * returned.
4274 *
4275 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4276 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4277 */
4278static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4279 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4280{
4281 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4282 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4283 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4284 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4285 ? fRoundingOffMask
4286 : 0;
4287 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4288
4289 /*
4290 * Deal with potential overflows/underflows first, optimizing for none.
4291 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4292 */
4293 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4294 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4295 { /* likely? */ }
4296 /*
4297 * Underflow if the exponent zero or negative. This is attempted mapped
4298 * to a subnormal number when possible, with some additional trickery ofc.
4299 */
4300 else if (iExponentOut <= 0)
4301 {
4302 bool const fIsTiny = iExponentOut < 0
4303 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4304 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4305 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4306 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4307
4308 if (iExponentOut <= 0)
4309 {
4310 uMantissaIn = iExponentOut <= -63
4311 ? uMantissaIn != 0
4312 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4313 fRoundedOff = uMantissaIn & fRoundingOffMask;
4314 if (fRoundedOff && fIsTiny)
4315 fFsw |= X86_FSW_UE;
4316 iExponentOut = 0;
4317 }
4318 }
4319 /*
4320 * Overflow if at or above max exponent value or if we will reach max
4321 * when rounding. Will return +/-zero or +/-max value depending on
4322 * whether we're rounding or not.
4323 */
4324 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4325 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4326 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4327 {
4328 fFsw |= X86_FSW_OE;
4329 if (!(fFcw & X86_FCW_OM))
4330 return fFsw | X86_FSW_ES | X86_FSW_B;
4331 fFsw |= X86_FSW_PE;
4332 if (uRoundingAdd)
4333 fFsw |= X86_FSW_C1;
4334 if (!(fFcw & X86_FCW_PM))
4335 fFsw |= X86_FSW_ES | X86_FSW_B;
4336
4337 pr32Dst->s.fSign = fSignIn;
4338 if (uRoundingAdd)
4339 { /* Zero */
4340 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4341 pr32Dst->s.uFraction = 0;
4342 }
4343 else
4344 { /* Max */
4345 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4346 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4347 }
4348 return fFsw;
4349 }
4350
4351 /*
4352 * Normal or subnormal number.
4353 */
4354 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4355 uint64_t uMantissaOut = uMantissaIn;
4356 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4357 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4358 || fRoundedOff != uRoundingAdd)
4359 {
4360 uMantissaOut = uMantissaIn + uRoundingAdd;
4361 if (uMantissaOut >= uMantissaIn)
4362 { /* likely */ }
4363 else
4364 {
4365 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4366 iExponentOut++;
4367 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4368 fFsw |= X86_FSW_C1;
4369 }
4370 }
4371 else
4372 uMantissaOut = uMantissaIn;
4373
4374 /* Truncate the mantissa and set the return value. */
4375 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4376
4377 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4378 pr32Dst->s.uExponent = iExponentOut;
4379 pr32Dst->s.fSign = fSignIn;
4380
4381 /* Set status flags realted to rounding. */
4382 if (fRoundedOff)
4383 {
4384 fFsw |= X86_FSW_PE;
4385 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4386 fFsw |= X86_FSW_C1;
4387 if (!(fFcw & X86_FCW_PM))
4388 fFsw |= X86_FSW_ES | X86_FSW_B;
4389 }
4390
4391 return fFsw;
4392}
4393
4394
4395/**
4396 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4397 */
4398IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4399 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4400{
4401 uint16_t const fFcw = pFpuState->FCW;
4402 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4403 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4404 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4405 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4406 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4407 {
4408 pr32Dst->s.fSign = pr80Src->s.fSign;
4409 pr32Dst->s.uExponent = 0;
4410 pr32Dst->s.uFraction = 0;
4411 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4412 }
4413 else if (RTFLOAT80U_IS_INF(pr80Src))
4414 {
4415 pr32Dst->s.fSign = pr80Src->s.fSign;
4416 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4417 pr32Dst->s.uFraction = 0;
4418 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4419 }
4420 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4421 {
4422 /* Mapped to +/-QNaN */
4423 pr32Dst->s.fSign = pr80Src->s.fSign;
4424 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4425 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4426 }
4427 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4428 {
4429 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4430 if (fFcw & X86_FCW_IM)
4431 {
4432 pr32Dst->s.fSign = 1;
4433 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4434 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4435 fFsw |= X86_FSW_IE;
4436 }
4437 else
4438 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4439 }
4440 else if (RTFLOAT80U_IS_NAN(pr80Src))
4441 {
4442 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4443 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4444 {
4445 pr32Dst->s.fSign = pr80Src->s.fSign;
4446 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4447 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4448 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4449 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4450 fFsw |= X86_FSW_IE;
4451 }
4452 else
4453 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4454 }
4455 else
4456 {
4457 /* Denormal values causes both an underflow and precision exception. */
4458 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4459 if (fFcw & X86_FCW_UM)
4460 {
4461 pr32Dst->s.fSign = pr80Src->s.fSign;
4462 pr32Dst->s.uExponent = 0;
4463 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4464 {
4465 pr32Dst->s.uFraction = 1;
4466 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4467 if (!(fFcw & X86_FCW_PM))
4468 fFsw |= X86_FSW_ES | X86_FSW_B;
4469 }
4470 else
4471 {
4472 pr32Dst->s.uFraction = 0;
4473 fFsw |= X86_FSW_UE | X86_FSW_PE;
4474 if (!(fFcw & X86_FCW_PM))
4475 fFsw |= X86_FSW_ES | X86_FSW_B;
4476 }
4477 }
4478 else
4479 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4480 }
4481 *pu16FSW = fFsw;
4482}
4483
4484
4485/**
4486 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4487 *
4488 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4489 *
4490 * @returns Updated FPU status word value.
4491 * @param fSignIn Incoming sign indicator.
4492 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4493 * @param iExponentIn Unbiased exponent.
4494 * @param fFcw The FPU control word.
4495 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4496 * @param pr64Dst Where to return the output value, if one should be
4497 * returned.
4498 *
4499 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4500 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4501 */
4502static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4503 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4504{
4505 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4506 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4507 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4508 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4509 ? fRoundingOffMask
4510 : 0;
4511 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4512
4513 /*
4514 * Deal with potential overflows/underflows first, optimizing for none.
4515 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4516 */
4517 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4518 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4519 { /* likely? */ }
4520 /*
4521 * Underflow if the exponent zero or negative. This is attempted mapped
4522 * to a subnormal number when possible, with some additional trickery ofc.
4523 */
4524 else if (iExponentOut <= 0)
4525 {
4526 bool const fIsTiny = iExponentOut < 0
4527 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4528 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4529 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4530 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4531
4532 if (iExponentOut <= 0)
4533 {
4534 uMantissaIn = iExponentOut <= -63
4535 ? uMantissaIn != 0
4536 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4537 fRoundedOff = uMantissaIn & fRoundingOffMask;
4538 if (fRoundedOff && fIsTiny)
4539 fFsw |= X86_FSW_UE;
4540 iExponentOut = 0;
4541 }
4542 }
4543 /*
4544 * Overflow if at or above max exponent value or if we will reach max
4545 * when rounding. Will return +/-zero or +/-max value depending on
4546 * whether we're rounding or not.
4547 */
4548 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4549 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4550 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4551 {
4552 fFsw |= X86_FSW_OE;
4553 if (!(fFcw & X86_FCW_OM))
4554 return fFsw | X86_FSW_ES | X86_FSW_B;
4555 fFsw |= X86_FSW_PE;
4556 if (uRoundingAdd)
4557 fFsw |= X86_FSW_C1;
4558 if (!(fFcw & X86_FCW_PM))
4559 fFsw |= X86_FSW_ES | X86_FSW_B;
4560
4561 pr64Dst->s64.fSign = fSignIn;
4562 if (uRoundingAdd)
4563 { /* Zero */
4564 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4565 pr64Dst->s64.uFraction = 0;
4566 }
4567 else
4568 { /* Max */
4569 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4570 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4571 }
4572 return fFsw;
4573 }
4574
4575 /*
4576 * Normal or subnormal number.
4577 */
4578 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4579 uint64_t uMantissaOut = uMantissaIn;
4580 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4581 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4582 || fRoundedOff != uRoundingAdd)
4583 {
4584 uMantissaOut = uMantissaIn + uRoundingAdd;
4585 if (uMantissaOut >= uMantissaIn)
4586 { /* likely */ }
4587 else
4588 {
4589 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4590 iExponentOut++;
4591 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4592 fFsw |= X86_FSW_C1;
4593 }
4594 }
4595 else
4596 uMantissaOut = uMantissaIn;
4597
4598 /* Truncate the mantissa and set the return value. */
4599 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4600
4601 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4602 pr64Dst->s64.uExponent = iExponentOut;
4603 pr64Dst->s64.fSign = fSignIn;
4604
4605 /* Set status flags realted to rounding. */
4606 if (fRoundedOff)
4607 {
4608 fFsw |= X86_FSW_PE;
4609 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4610 fFsw |= X86_FSW_C1;
4611 if (!(fFcw & X86_FCW_PM))
4612 fFsw |= X86_FSW_ES | X86_FSW_B;
4613 }
4614
4615 return fFsw;
4616}
4617
4618
4619/**
4620 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4621 */
4622IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4623 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4624{
4625 uint16_t const fFcw = pFpuState->FCW;
4626 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4627 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4628 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4629 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4630 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4631 {
4632 pr64Dst->s64.fSign = pr80Src->s.fSign;
4633 pr64Dst->s64.uExponent = 0;
4634 pr64Dst->s64.uFraction = 0;
4635 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4636 }
4637 else if (RTFLOAT80U_IS_INF(pr80Src))
4638 {
4639 pr64Dst->s64.fSign = pr80Src->s.fSign;
4640 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4641 pr64Dst->s64.uFraction = 0;
4642 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4643 }
4644 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4645 {
4646 /* Mapped to +/-QNaN */
4647 pr64Dst->s64.fSign = pr80Src->s.fSign;
4648 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4649 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4650 }
4651 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4652 {
4653 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4654 if (fFcw & X86_FCW_IM)
4655 {
4656 pr64Dst->s64.fSign = 1;
4657 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4658 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4659 fFsw |= X86_FSW_IE;
4660 }
4661 else
4662 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4663 }
4664 else if (RTFLOAT80U_IS_NAN(pr80Src))
4665 {
4666 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4667 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4668 {
4669 pr64Dst->s64.fSign = pr80Src->s.fSign;
4670 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4671 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4672 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4673 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4674 fFsw |= X86_FSW_IE;
4675 }
4676 else
4677 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4678 }
4679 else
4680 {
4681 /* Denormal values causes both an underflow and precision exception. */
4682 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4683 if (fFcw & X86_FCW_UM)
4684 {
4685 pr64Dst->s64.fSign = pr80Src->s.fSign;
4686 pr64Dst->s64.uExponent = 0;
4687 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4688 {
4689 pr64Dst->s64.uFraction = 1;
4690 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4691 if (!(fFcw & X86_FCW_PM))
4692 fFsw |= X86_FSW_ES | X86_FSW_B;
4693 }
4694 else
4695 {
4696 pr64Dst->s64.uFraction = 0;
4697 fFsw |= X86_FSW_UE | X86_FSW_PE;
4698 if (!(fFcw & X86_FCW_PM))
4699 fFsw |= X86_FSW_ES | X86_FSW_B;
4700 }
4701 }
4702 else
4703 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4704 }
4705 *pu16FSW = fFsw;
4706}
4707
4708
4709IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4710 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4711{
4712 /*
4713 * FPU status word:
4714 * - TOP is irrelevant, but we must match x86 assembly version (0).
4715 * - C1 is always cleared as we don't have any stack overflows.
4716 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4717 */
4718 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4719 *pr80Dst = *pr80Src;
4720}
4721
4722
4723/*
4724 * Emits iemAImpl_fist_r80_to_i<a_cBits>: converts an 80-bit floating point
 * value to a signed integer, rounding according to FCW.RC, and returns the
 * resulting FPU status word via pu16FSW.
 *
 * Macro parameters:
 *      a_cBits             Width of the target integer in bits.
 *      a_iType             The target integer type.
 *      a_iTypeMin          The smallest value of a_iType.
 *      a_iTypeIndefinite   The value stored when the conversion is invalid
 *                          and the invalid operation exception is masked.
 *
 * The destination is not written when an unmasked invalid operation
 * exception is raised.
 *
4725 * Mantissa:
4726 * 63 56 48 40 32 24 16 8 0
4727 * v v v v v v v v v
4728 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4729 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4730 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4731 *
4732 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4733 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4734 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4735 * where we'll drop off all but bit 63.
4736 */
4737#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4738IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4739 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4740{ \
4741 uint16_t const fFcw = pFpuState->FCW; \
4742 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4743 bool const fSignIn = pr80Val->s.fSign; \
4744 \
4745 /* \
4746 * Deal with normal numbers first. \
4747 */ \
4748 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4749 { \
4750 uint64_t uMantissa = pr80Val->s.uMantissa; \
4751 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4752 \
4753 if ((uint32_t)iExponent <= a_cBits - 2) \
4754 { \
4755 unsigned const cShiftOff = 63 - iExponent; \
4756 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4757 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4758 ? RT_BIT_64(cShiftOff - 1) \
4759 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4760 ? fRoundingOffMask \
4761 : 0; \
4762 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4763 \
4764 uMantissa >>= cShiftOff; \
4765 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4766 uMantissa += uRounding; \
4767 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4768 { \
4769 if (fRoundedOff) \
4770 { \
4771 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4772 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4773 else if (uRounding) \
4774 fFsw |= X86_FSW_C1; \
4775 fFsw |= X86_FSW_PE; \
4776 if (!(fFcw & X86_FCW_PM)) \
4777 fFsw |= X86_FSW_ES | X86_FSW_B; \
4778 } \
4779 \
4780 if (!fSignIn) \
4781 *piDst = (a_iType)uMantissa; \
4782 else \
4783 *piDst = -(a_iType)uMantissa; \
4784 } \
4785 else \
4786 { \
4787 /* The rounded magnitude reached 2**(a_cBits - 1), i.e. overflowed after rounding. */ \
4788 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4789 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4790 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4791 \
4792 /* Special case for the integer minimum value. */ \
4793 if (fSignIn) \
4794 { \
4795 *piDst = a_iTypeMin; \
4796 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4797 if (!(fFcw & X86_FCW_PM)) \
4798 fFsw |= X86_FSW_ES | X86_FSW_B; \
4799 } \
4800 else \
4801 { \
4802 fFsw |= X86_FSW_IE; \
4803 if (fFcw & X86_FCW_IM) \
4804 *piDst = a_iTypeMin; \
4805 else \
4806 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4807 } \
4808 } \
4809 } \
4810 /* \
4811 * Tiny numbers with a magnitude below one (negative exponent). \
4812 */ \
4813 else if (iExponent < 0) \
4814 { \
4815 if (!fSignIn) \
4816 { \
4817 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4818 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4819 { \
4820 *piDst = 1; \
4821 fFsw |= X86_FSW_C1; \
4822 } \
4823 else \
4824 *piDst = 0; \
4825 } \
4826 else \
4827 { \
4828 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4829 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4830 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4831 *piDst = 0; \
4832 else \
4833 { \
4834 *piDst = -1; \
4835 fFsw |= X86_FSW_C1; \
4836 } \
4837 } \
4838 fFsw |= X86_FSW_PE; \
4839 if (!(fFcw & X86_FCW_PM)) \
4840 fFsw |= X86_FSW_ES | X86_FSW_B; \
4841 } \
4842 /* \
4843 * Special MIN case: negative input with exponent a_cBits - 1 that still maps to a_iTypeMin. \
4844 */ \
4845 else if ( fSignIn && iExponent == a_cBits - 1 \
4846 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4847 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4848 : uMantissa == RT_BIT_64(63))) \
4849 { \
4850 *piDst = a_iTypeMin; \
4851 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4852 { \
4853 fFsw |= X86_FSW_PE; \
4854 if (!(fFcw & X86_FCW_PM)) \
4855 fFsw |= X86_FSW_ES | X86_FSW_B; \
4856 } \
4857 } \
4858 /* \
4859 * Too large/small number outside the target integer range. \
4860 */ \
4861 else \
4862 { \
4863 fFsw |= X86_FSW_IE; \
4864 if (fFcw & X86_FCW_IM) \
4865 *piDst = a_iTypeIndefinite; \
4866 else \
4867 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4868 } \
4869 } \
4870 /* \
4871 * Map both +0 and -0 to integer zero (signless/+). \
4872 */ \
4873 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4874 *piDst = 0; \
4875 /* \
4876 * Denormals are just really tiny numbers (magnitude below one) that are either \
4877 * rounded to zero, 1 or -1 depending on sign and rounding control. \
4878 */ \
4879 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4880 { \
4881 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4882 *piDst = 0; \
4883 else \
4884 { \
4885 *piDst = fSignIn ? -1 : 1; \
4886 fFsw |= X86_FSW_C1; \
4887 } \
4888 fFsw |= X86_FSW_PE; \
4889 if (!(fFcw & X86_FCW_PM)) \
4890 fFsw |= X86_FSW_ES | X86_FSW_B; \
4891 } \
4892 /* \
4893 * All other special values are considered invalid arguments and result \
4894 * in an IE exception and indefinite value if masked. \
4895 */ \
4896 else \
4897 { \
4898 fFsw |= X86_FSW_IE; \
4899 if (fFcw & X86_FCW_IM) \
4900 *piDst = a_iTypeIndefinite; \
4901 else \
4902 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4903 } \
4904 *pu16FSW = fFsw; \
4905}
4906EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4907EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4908EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4909
4910#endif /*IEM_WITHOUT_ASSEMBLY */
4911
4912
4913/*
4914 * The FISTT instruction was added with SSE3 and is a lot simpler than FIST.
 *
 * The emitted function always truncates (the rounding control in FCW is not
 * consulted) and returns the resulting FPU status word via pu16FSW.
4915 *
4916 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4917 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4918 * thus the @a a_cBitsIn.
 *
 * Macro parameters:
 *      a_cBits             Width of the target integer in bits.
 *      a_cBitsIn           Width used for the in-range exponent check.  Note
 *                          that every instantiation below passes the same
 *                          value as a_cBits.
 *      a_iType             The target integer type.
 *      a_iTypeMin          The smallest value of a_iType.
 *      a_iTypeMax          Not referenced by the macro body.
 *      a_iTypeIndefinite   The value stored when the conversion is invalid
 *                          and the invalid operation exception is masked.
 *      a_Suffix            Suffix appended to the emitted function name.
 *      a_fIntelVersion     Not referenced by the macro body.
4919 */
4920#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4921IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4922 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4923{ \
4924 uint16_t const fFcw = pFpuState->FCW; \
4925 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4926 bool const fSignIn = pr80Val->s.fSign; \
4927 \
4928 /* \
4929 * Deal with normal numbers first. \
4930 */ \
4931 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4932 { \
4933 uint64_t uMantissa = pr80Val->s.uMantissa; \
4934 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4935 \
4936 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4937 { \
4938 unsigned const cShiftOff = 63 - iExponent; \
4939 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4940 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4941 uMantissa >>= cShiftOff; \
4942 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4943 if (!fSignIn) \
4944 *piDst = (a_iType)uMantissa; \
4945 else \
4946 *piDst = -(a_iType)uMantissa; \
4947 \
4948 if (fRoundedOff) \
4949 { \
4950 fFsw |= X86_FSW_PE; \
4951 if (!(fFcw & X86_FCW_PM)) \
4952 fFsw |= X86_FSW_ES | X86_FSW_B; \
4953 } \
4954 } \
4955 /* \
4956 * Tiny numbers with a magnitude below one, these truncate to zero. \
4957 */ \
4958 else if (iExponent < 0) \
4959 { \
4960 *piDst = 0; \
4961 fFsw |= X86_FSW_PE; \
4962 if (!(fFcw & X86_FCW_PM)) \
4963 fFsw |= X86_FSW_ES | X86_FSW_B; \
4964 } \
4965 /* \
4966 * Special MIN case: negative input with exponent a_cBits - 1 that still maps to a_iTypeMin. \
4967 */ \
4968 else if ( fSignIn && iExponent == a_cBits - 1 \
4969 && (a_cBits < 64 \
4970 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4971 : uMantissa == RT_BIT_64(63)) ) \
4972 { \
4973 *piDst = a_iTypeMin; \
4974 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4975 { \
4976 fFsw |= X86_FSW_PE; \
4977 if (!(fFcw & X86_FCW_PM)) \
4978 fFsw |= X86_FSW_ES | X86_FSW_B; \
4979 } \
4980 } \
4981 /* \
4982 * Figure this weirdness. (Branch is disabled by the leading 0 in the condition.) \
4983 */ \
4984 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4985 { \
4986 *piDst = 0; \
4987 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4988 { \
4989 fFsw |= X86_FSW_PE; \
4990 if (!(fFcw & X86_FCW_PM)) \
4991 fFsw |= X86_FSW_ES | X86_FSW_B; \
4992 } \
4993 } \
4994 /* \
4995 * Too large/small number outside the target integer range. \
4996 */ \
4997 else \
4998 { \
4999 fFsw |= X86_FSW_IE; \
5000 if (fFcw & X86_FCW_IM) \
5001 *piDst = a_iTypeIndefinite; \
5002 else \
5003 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
5004 } \
5005 } \
5006 /* \
5007 * Map both +0 and -0 to integer zero (signless/+). \
5008 */ \
5009 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
5010 *piDst = 0; \
5011 /* \
5012 * Denormals are just really tiny numbers (magnitude below one) that are truncated to zero. \
5013 */ \
5014 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
5015 { \
5016 *piDst = 0; \
5017 fFsw |= X86_FSW_PE; \
5018 if (!(fFcw & X86_FCW_PM)) \
5019 fFsw |= X86_FSW_ES | X86_FSW_B; \
5020 } \
5021 /* \
5022 * All other special values are considered invalid arguments and result \
5023 * in an IE exception and indefinite value if masked. \
5024 */ \
5025 else \
5026 { \
5027 fFsw |= X86_FSW_IE; \
5028 if (fFcw & X86_FCW_IM) \
5029 *piDst = a_iTypeIndefinite; \
5030 else \
5031 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
5032 } \
5033 *pu16FSW = fFsw; \
5034}
5035#if defined(IEM_WITHOUT_ASSEMBLY)
5036EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
5037EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
5038EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
5039#endif
/* The _intel and _amd 16-bit variants are emitted unconditionally. */
5040EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
5041EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
5042
5043
5044#if defined(IEM_WITHOUT_ASSEMBLY)
5045
/**
 * Stores an 80-bit floating point value as an 80-bit packed BCD integer.
 *
 * The value is rounded to an integer according to FCW.RC and written as
 * decimal digit pairs, least significant pair first.  Values that do not fit
 * the packed BCD range and special values other than zero and (pseudo-)
 * denormals raise IE; the indefinite value is stored when IE is masked and
 * the destination is left untouched otherwise.
 *
 * @param   pFpuState   The FPU state, FCW and FSW are read from it.
 * @param   pu16FSW     Where to return the resulting FPU status word.
 * @param   pd80Dst     Where to store the packed BCD value.
 * @param   pr80Src     The value to convert.
 */
5046IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
5047 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
5048{
5049 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
/* Constant results, the two-entry tables are indexed by the sign of the input. */
5050 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
5051 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
5052 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
5053 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
5054
5055 uint16_t const fFcw = pFpuState->FCW;
5056 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
5057 bool const fSignIn = pr80Src->s.fSign;
5058
5059 /*
5060 * Deal with normal numbers first.
5061 */
5062 if (RTFLOAT80U_IS_NORMAL(pr80Src))
5063 {
5064 uint64_t uMantissa = pr80Src->s.uMantissa;
5065 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
/* In-range check before rounding.  At exponent 59 the mantissa limit corresponds
 * to magnitudes below 10**18 (0xde0b6b3a76400000 is 10**18 shifted left by 4). */
5066 if ( (uint32_t)iExponent <= 58
5067 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
5068 {
5069 unsigned const cShiftOff = 63 - iExponent;
5070 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5071 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5072 ? RT_BIT_64(cShiftOff - 1)
5073 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5074 ? fRoundingOffMask
5075 : 0;
5076 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
5077
5078 uMantissa >>= cShiftOff;
5079 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
5080 uMantissa += uRounding;
5081 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
5082 {
5083 if (fRoundedOff)
5084 {
5085 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
5086 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
5087 else if (uRounding)
5088 fFsw |= X86_FSW_C1;
5089 fFsw |= X86_FSW_PE;
5090 if (!(fFcw & X86_FCW_PM))
5091 fFsw |= X86_FSW_ES | X86_FSW_B;
5092 }
5093
5094 pd80Dst->s.fSign = fSignIn;
5095 pd80Dst->s.uPad = 0;
/* Peel off two decimal digits per iteration, least significant pair first. */
5096 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
5097 {
5098 unsigned const uDigits = uMantissa % 100;
5099 uMantissa /= 100;
5100 uint8_t const bLo = uDigits % 10;
5101 uint8_t const bHi = uDigits / 10;
5102 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
5103 }
5104 }
5105 else
5106 {
5107 /* overflowed after rounding. */
5108 fFsw |= X86_FSW_IE;
5109 if (fFcw & X86_FCW_IM)
5110 *pd80Dst = s_d80Indefinite;
5111 else
5112 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
5113 }
5114 }
5115 /*
5116 * Tiny numbers with a magnitude below one (negative exponent).
5117 */
5118 else if (iExponent < 0)
5119 {
5120 if (!fSignIn)
5121 {
5122 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
5123 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
5124 {
5125 *pd80Dst = s_ad80One[fSignIn];
5126 fFsw |= X86_FSW_C1;
5127 }
5128 else
5129 *pd80Dst = s_ad80Zeros[fSignIn];
5130 }
5131 else
5132 {
5133 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
5134 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
5135 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
5136 *pd80Dst = s_ad80Zeros[fSignIn];
5137 else
5138 {
5139 *pd80Dst = s_ad80One[fSignIn];
5140 fFsw |= X86_FSW_C1;
5141 }
5142 }
5143 fFsw |= X86_FSW_PE;
5144 if (!(fFcw & X86_FCW_PM))
5145 fFsw |= X86_FSW_ES | X86_FSW_B;
5146 }
5147 /*
5148 * Too large/small number outside the target integer range.
5149 */
5150 else
5151 {
5152 fFsw |= X86_FSW_IE;
5153 if (fFcw & X86_FCW_IM)
5154 *pd80Dst = s_d80Indefinite;
5155 else
5156 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
5157 }
5158 }
5159 /*
5160 * Map +0 and -0 to packed BCD zero, keeping the sign of the input.
5161 */
5162 else if (RTFLOAT80U_IS_ZERO(pr80Src))
5163 *pd80Dst = s_ad80Zeros[fSignIn];
5164 /*
5165 * Denormals are just really tiny numbers (magnitude below one) that are either
5166 * rounded to zero, 1 or -1 depending on sign and rounding control.
5167 */
5168 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
5169 {
5170 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
5171 *pd80Dst = s_ad80Zeros[fSignIn];
5172 else
5173 {
5174 *pd80Dst = s_ad80One[fSignIn];
5175 fFsw |= X86_FSW_C1;
5176 }
5177 fFsw |= X86_FSW_PE;
5178 if (!(fFcw & X86_FCW_PM))
5179 fFsw |= X86_FSW_ES | X86_FSW_B;
5180 }
5181 /*
5182 * All other special values are considered invalid arguments and result
5183 * in an IE exception and indefinite value if masked.
5184 */
5185 else
5186 {
5187 fFsw |= X86_FSW_IE;
5188 if (fFcw & X86_FCW_IM)
5189 *pd80Dst = s_d80Indefinite;
5190 else
5191 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
5192 }
5193 *pu16FSW = fFsw;
5194}
5195
5196
5197/*********************************************************************************************************************************
5198* FPU Helpers *
5199*********************************************************************************************************************************/
/* Compile-time checks of the byte sizes of the floating point unions (128, 80, 64 and 32 bits). */
5200AssertCompileSize(RTFLOAT128U, 16);
5201AssertCompileSize(RTFLOAT80U, 10);
5202AssertCompileSize(RTFLOAT64U, 8);
5203AssertCompileSize(RTFLOAT32U, 4);
5204
5205/**
5206 * Normalizes a possible pseudo-denormal value.
5207 *
5208 * Pseudo-denormal values are some oddities from the 8087 & 287 days. They are
5209 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
5210 * i.e. changing uExponent from 0 to 1.
5211 *
5212 * This macro will declare a RTFLOAT80U with the name given by
5213 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
5214 * a normalization was performed.
 *
 * Since it expands to a declaration followed by an if/else statement, it must
 * be used at statement level and be terminated with a semicolon.  The
 * @a a_r80ValNormalized variable is only initialized when the normalization
 * is performed.
5215 *
5216 * @note This must be applied before calling SoftFloat with a value that could be
5217 * a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
5218 * correctly.
5219 */
5220#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
5221 RTFLOAT80U a_r80ValNormalized; \
5222 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
5223 { \
5224 a_r80ValNormalized = *a_pr80Val; \
5225 a_r80ValNormalized.s.uExponent = 1; \
5226 a_pr80Val = &a_r80ValNormalized; \
5227 } else do {} while (0)
5228
5229#ifdef IEM_WITH_FLOAT128_FOR_FPU
5230
5231DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
5232{
5233 int fNew;
5234 switch (fFcw & X86_FCW_RC_MASK)
5235 {
5236 default:
5237 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
5238 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
5239 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
5240 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
5241 }
5242 int fOld = fegetround();
5243 fesetround(fNew);
5244 return fOld;
5245}
5246
5247
5248DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
5249{
5250 fesetround(fOld);
5251}
5252
5253DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
5254{
5255 RT_NOREF(fFcw);
5256 RTFLOAT128U Tmp;
5257 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
5258 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
5259 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
5260 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
5261 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
5262 {
5263 Assert(Tmp.s.uExponent == 0);
5264 Tmp.s2.uSignAndExponent++;
5265 }
5266 return *(_Float128 *)&Tmp;
5267}
5268
5269
5270DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5271{
5272 RT_NOREF(fFcw);
5273 RTFLOAT128U Tmp;
5274 *(_Float128 *)&Tmp = rd128ValSrc;
5275 ASMCompilerBarrier();
5276 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5277 {
5278 pr80Dst->s.fSign = Tmp.s64.fSign;
5279 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5280 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5281 | Tmp.s64.uFractionLo >> (64 - 15);
5282
5283 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5284 unsigned const cShiftOff = 64 - 15;
5285 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5286 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5287 if (uRoundedOff)
5288 {
5289 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5290 ? RT_BIT_64(cShiftOff - 1)
5291 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5292 ? fRoundingOffMask
5293 : 0;
5294 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5295 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5296 || uRoundedOff != uRoundingAdd)
5297 {
5298 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5299 {
5300 uFraction += 1;
5301 if (!(uFraction & RT_BIT_64(63)))
5302 { /* likely */ }
5303 else
5304 {
5305 uFraction >>= 1;
5306 pr80Dst->s.uExponent++;
5307 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5308 return fFsw;
5309 }
5310 fFsw |= X86_FSW_C1;
5311 }
5312 }
5313 fFsw |= X86_FSW_PE;
5314 if (!(fFcw & X86_FCW_PM))
5315 fFsw |= X86_FSW_ES | X86_FSW_B;
5316 }
5317 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5318 }
5319 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5320 {
5321 pr80Dst->s.fSign = Tmp.s64.fSign;
5322 pr80Dst->s.uExponent = 0;
5323 pr80Dst->s.uMantissa = 0;
5324 }
5325 else if (RTFLOAT128U_IS_INF(&Tmp))
5326 {
5327 pr80Dst->s.fSign = Tmp.s64.fSign;
5328 pr80Dst->s.uExponent = 0;
5329 pr80Dst->s.uMantissa = 0;
5330 }
5331 return fFsw;
5332}
5333
5334
5335#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5336
/**
 * Initializer for the SoftFloat state structure.
 *
 * Expands to a brace initializer with five positional values derived from the
 * FPU control word @a a_fFcw:
 *      -# softfloat_tininess_afterRounding,
 *      -# the rounding mode selected by FCW.RC (nearest -> near_even, up -> max,
 *         down -> min, anything else -> minMag),
 *      -# zero,
 *      -# the FCW exception mask bits (a_fFcw & X86_FCW_XCPT_MASK),
 *      -# 64, 32 or 80 depending on FCW.PC (53-bit, 24-bit, otherwise).
 *
 * NOTE(review): the values are positional, so this relies on the member order
 * of softfloat_state_t (presumably tininess mode, rounding mode, exception
 * flags, exception mask, rounding precision) - confirm against softfloat.h.
 *
 * @param   a_fFcw      The FPU control word.  Evaluated several times.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
          ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN    ? (uint8_t)softfloat_round_min \
        :                                                      (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
          ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
    }
5350
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The result is @a a_fFsw ORed with:
 *      - the SoftFloat C1 flag shifted left by two bits,
 *      - the SoftFloat exception flags covered by X86_FSW_XCPT_MASK (the flags
 *        are used directly as FSW exception bits),
 *      - X86_FSW_ES and X86_FSW_B if any of those exceptions is not masked in
 *        @a a_fFcw.
 *
 * NOTE(review): presumably softfloat_flag_c1 << 2 equals X86_FSW_C1 - confirm
 * against the SoftFloat headers.
 *
 * @param   a_fFsw          The FSW to update.
 * @param   a_pSoftState    Pointer to the SoftFloat state following the operation.
 *                          Evaluated several times.
 * @param   a_fFcw          The FPU control word (exception masks).
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (  (a_fFsw) \
     | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
     | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
     | (  ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
        ? X86_FSW_ES | X86_FSW_B : 0) )
5358
5359
5360DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5361{
5362 RT_NOREF(fFcw);
5363 Assert(cBits > 64);
5364# if 0 /* rounding does not seem to help */
5365 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5366 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5367 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5368 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5369 {
5370 uint64_t uOld = r128.v[0];
5371 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5372 if (r128.v[0] < uOld)
5373 r128.v[1] += 1;
5374 }
5375# else
5376 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5377# endif
5378 return r128;
5379}
5380
5381
5382DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5383{
5384 RT_NOREF(fFcw);
5385 Assert(cBits > 64);
5386# if 0 /* rounding does not seem to help, not even on constants */
5387 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5388 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5389 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5390 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5391 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5392 {
5393 uint64_t uOld = r128.v[0];
5394 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5395 if (r128.v[0] < uOld)
5396 r128.v[1] += 1;
5397 }
5398 return r128;
5399# else
5400 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5401 return r128;
5402# endif
5403}
5404
5405
# if 0 /* unused */
/**
 * Repackages an IPRT 128-bit float (RTFLOAT128U) as a SoftFloat float128_t.
 *
 * Both 64-bit words are copied as-is, no precision reduction is applied.
 * Currently compiled out because nothing uses it.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
    return r128;
}
# endif
5413
5414
5415/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5416DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5417{
5418 extFloat80_t Tmp;
5419 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5420 Tmp.signif = pr80Val->s2.uMantissa;
5421 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5422 return extF80_to_f128(Tmp, &Ignored);
5423}
5424
5425
5426/**
5427 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5428 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5429 *
5430 * This is only a structure format conversion, nothing else.
5431 */
5432DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5433{
5434 extFloat80_t Tmp;
5435 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5436 Tmp.signif = pr80Val->s2.uMantissa;
5437 return Tmp;
5438}
5439
5440
5441/**
5442 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5443 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5444 *
5445 * This is only a structure format conversion, nothing else.
5446 */
5447DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5448{
5449 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5450 pr80Dst->s2.uMantissa = r80XSrc.signif;
5451 return pr80Dst;
5452}
5453
5454
5455DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5456{
5457 RT_NOREF(fFcw);
5458 RTFLOAT128U Tmp;
5459 *(float128_t *)&Tmp = r128Src;
5460 ASMCompilerBarrier();
5461
5462 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5463 {
5464 pr80Dst->s.fSign = Tmp.s64.fSign;
5465 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5466 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5467 | Tmp.s64.uFractionLo >> (64 - 15);
5468
5469 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5470 unsigned const cShiftOff = 64 - 15;
5471 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5472 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5473 if (uRoundedOff)
5474 {
5475 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5476 ? RT_BIT_64(cShiftOff - 1)
5477 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5478 ? fRoundingOffMask
5479 : 0;
5480 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5481 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5482 || uRoundedOff != uRoundingAdd)
5483 {
5484 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5485 {
5486 uFraction += 1;
5487 if (!(uFraction & RT_BIT_64(63)))
5488 { /* likely */ }
5489 else
5490 {
5491 uFraction >>= 1;
5492 pr80Dst->s.uExponent++;
5493 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5494 return fFsw;
5495 }
5496 fFsw |= X86_FSW_C1;
5497 }
5498 }
5499 fFsw |= X86_FSW_PE;
5500 if (!(fFcw & X86_FCW_PM))
5501 fFsw |= X86_FSW_ES | X86_FSW_B;
5502 }
5503
5504 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5505 }
5506 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5507 {
5508 pr80Dst->s.fSign = Tmp.s64.fSign;
5509 pr80Dst->s.uExponent = 0;
5510 pr80Dst->s.uMantissa = 0;
5511 }
5512 else if (RTFLOAT128U_IS_INF(&Tmp))
5513 {
5514 pr80Dst->s.fSign = Tmp.s64.fSign;
5515 pr80Dst->s.uExponent = 0x7fff;
5516 pr80Dst->s.uMantissa = 0;
5517 }
5518 return fFsw;
5519}
5520
5521
5522/**
5523 * Helper for transfering exception and C1 to FSW and setting the result value
5524 * accordingly.
5525 *
5526 * @returns Updated FSW.
5527 * @param pSoftState The SoftFloat state following the operation.
5528 * @param r80XResult The result of the SoftFloat operation.
5529 * @param pr80Result Where to store the result for IEM.
5530 * @param fFcw The FPU control word.
5531 * @param fFsw The FSW before the operation, with necessary bits
5532 * cleared and such.
5533 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5534 * raised.
5535 */
5536DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5537 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5538 PCRTFLOAT80U pr80XcptResult)
5539{
5540 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5541 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5542 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5543 fFsw |= X86_FSW_ES | X86_FSW_B;
5544
5545 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5546 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5547 else
5548 {
5549 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5550 *pr80Result = *pr80XcptResult;
5551 }
5552 return fFsw;
5553}
5554
5555
5556/**
5557 * Helper doing polynomial evaluation using Horner's method.
5558 *
5559 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5560 */
5561float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5562 unsigned cPrecision, softfloat_state_t *pSoftState)
5563{
5564 Assert(cHornerConsts > 1);
5565 size_t i = cHornerConsts - 1;
5566 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5567 while (i-- > 0)
5568 {
5569 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5570 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5571 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5572 }
5573 return r128Result;
5574}
5575
5576#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5577
5578
/**
 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
 * mantissa, exponent and sign.
 *
 * The most significant 64 bits of the mantissa (qw2) become the result
 * mantissa, the remaining 128 bits (qw1 and qw0) only steer the rounding.
 *
 * @returns Updated FSW.
 * @param   pr80Dst         Where to return the composed value.
 * @param   fSign           The sign.
 * @param   puMantissa      The mantissa, 256-bit type but the top 64 bits are
 *                          ignored and should be zero.  This will probably be
 *                          modified during normalization and rounding.
 * @param   iExponent       Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            The FPU status word.
 */
static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
                                                    int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
{
    AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);

    /* From here on iExponent is the biased exponent. */
    iExponent += RTFLOAT80U_EXP_BIAS;

    /* Do normalization if necessary and possible. */
    if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
    {
        /* Number of left shifts needed to bring the top set bit up to bit 191. */
        int cShift = 192 - RTUInt256BitCount(puMantissa);
        if (iExponent > cShift)
            iExponent -= cShift;
        else
        {
            /* Not enough exponent to normalize fully.  With underflow masked the
               shift is limited to what the exponent allows; with it unmasked the
               full shift is applied and the exponent goes to zero or below. */
            if (fFcw & X86_FCW_UM)
            {
                if (iExponent > 0)
                    cShift = --iExponent;
                else
                    cShift = 0;
            }
            iExponent -= cShift;
        }
        RTUInt256AssignShiftLeft(puMantissa, cShift);
    }

    /* Do rounding. */
    uint64_t uMantissa = puMantissa->QWords.qw2;
    if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
    {
        bool fAdd;
        switch (fFcw & X86_FCW_RC_MASK)
        {
            default: /* (for the simple-minded MSC which otherwise thinks fAdd would be used uninitialized) */
            case X86_FCW_RC_NEAREST:
                /* Round up when the discarded part is more than half an ULP, or
                   exactly half with an odd mantissa (ties go to even). */
                if (puMantissa->QWords.qw1 & RT_BIT_64(63))
                {
                    if (   (uMantissa & 1)
                        || puMantissa->QWords.qw0 != 0
                        || puMantissa->QWords.qw1 != RT_BIT_64(63))
                    {
                        fAdd = true;
                        break;
                    }
                    uMantissa &= ~(uint64_t)1;
                }
                fAdd = false;
                break;
            case X86_FCW_RC_ZERO:
                fAdd = false;
                break;
            case X86_FCW_RC_UP:
                fAdd = !fSign;
                break;
            case X86_FCW_RC_DOWN:
                fAdd = fSign;
                break;
        }
        if (fAdd)
        {
            uint64_t const uTmp = uMantissa;
            uMantissa = uTmp + 1;
            if (uMantissa < uTmp)
            {
                /* The increment wrapped around to zero: the result is 1.0 with
                   the next higher exponent. */
                uMantissa >>= 1;
                uMantissa |= RT_BIT_64(63);
                iExponent++;
            }
            fFsw |= X86_FSW_C1;
        }
        /* Bits were discarded, so the result is inexact. */
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    /* Check for underflow (denormals). */
    if (iExponent <= 0)
    {
        if (fFcw & X86_FCW_UM)
        {
            /* Masked: the result is stored with a zero exponent. */
            if (uMantissa & RT_BIT_64(63))
                uMantissa >>= 1;
            iExponent = 0;
        }
        else
        {
            /* Unmasked: the exponent is adjusted by RTFLOAT80U_EXP_BIAS_ADJUST
               and the exception summary is raised. */
            iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_UE;
    }
    /* Check for overflow */
    else if (iExponent >= RTFLOAT80U_EXP_MAX)
    {
        /* NOTE(review): overflow is not handled; this assertion cannot hold
           inside this branch, so it merely flags the case in strict builds. */
        Assert(iExponent < RTFLOAT80U_EXP_MAX);
    }

    /* Compose the result. */
    pr80Dst->s.uMantissa = uMantissa;
    pr80Dst->s.uExponent = iExponent;
    pr80Dst->s.fSign     = fSign;
    return fFsw;
}
5697
5698
5699/**
5700 * See also iemAImpl_fld_r80_from_r32
5701 */
5702static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5703{
5704 uint16_t fFsw = 0;
5705 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5706 {
5707 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5708 pr80Dst->sj64.fInteger = 1;
5709 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5710 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5711 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5712 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5713 }
5714 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5715 {
5716 pr80Dst->s.fSign = pr32Val->s.fSign;
5717 pr80Dst->s.uExponent = 0;
5718 pr80Dst->s.uMantissa = 0;
5719 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5720 }
5721 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5722 {
5723 /* Subnormal -> normalized + X86_FSW_DE return. */
5724 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5725 pr80Dst->sj64.fInteger = 1;
5726 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5727 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5728 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5729 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5730 fFsw = X86_FSW_DE;
5731 }
5732 else if (RTFLOAT32U_IS_INF(pr32Val))
5733 {
5734 pr80Dst->s.fSign = pr32Val->s.fSign;
5735 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5736 pr80Dst->s.uMantissa = RT_BIT_64(63);
5737 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5738 }
5739 else
5740 {
5741 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5742 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5743 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5744 pr80Dst->sj64.fInteger = 1;
5745 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5746 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5747 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5748 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5749 }
5750 return fFsw;
5751}
5752
5753
5754/**
5755 * See also iemAImpl_fld_r80_from_r64
5756 */
5757static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5758{
5759 uint16_t fFsw = 0;
5760 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5761 {
5762 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5763 pr80Dst->sj64.fInteger = 1;
5764 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5765 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5766 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5767 }
5768 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5769 {
5770 pr80Dst->s.fSign = pr64Val->s.fSign;
5771 pr80Dst->s.uExponent = 0;
5772 pr80Dst->s.uMantissa = 0;
5773 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5774 }
5775 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5776 {
5777 /* Subnormal values gets normalized. */
5778 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5779 pr80Dst->sj64.fInteger = 1;
5780 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5781 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5782 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5783 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5784 fFsw = X86_FSW_DE;
5785 }
5786 else if (RTFLOAT64U_IS_INF(pr64Val))
5787 {
5788 pr80Dst->s.fSign = pr64Val->s.fSign;
5789 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5790 pr80Dst->s.uMantissa = RT_BIT_64(63);
5791 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5792 }
5793 else
5794 {
5795 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5796 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5797 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5798 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5799 pr80Dst->sj64.fInteger = 1;
5800 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5801 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5802 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5803 }
5804 return fFsw;
5805}
5806
5807
5808/**
5809 * See also EMIT_FILD.
5810 */
5811#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5812static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5813{ \
5814 if (iVal == 0) \
5815 { \
5816 pr80Dst->s.fSign = 0; \
5817 pr80Dst->s.uExponent = 0; \
5818 pr80Dst->s.uMantissa = 0; \
5819 } \
5820 else \
5821 { \
5822 if (iVal > 0) \
5823 pr80Dst->s.fSign = 0; \
5824 else \
5825 { \
5826 pr80Dst->s.fSign = 1; \
5827 iVal = -iVal; \
5828 } \
5829 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5830 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5831 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5832 } \
5833 return pr80Dst; \
5834}
5835EMIT_CONVERT_IXX_TO_R80(16)
5836EMIT_CONVERT_IXX_TO_R80(32)
5837//EMIT_CONVERT_IXX_TO_R80(64)
5838
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * The emitted function converts the 64-bit operand to 80 bits, calls
 * @a a_fnR80ByR80 on the pair and then forces the TOP field of the resulting
 * FSW to 7.
 *
 * The conversion reports X86_FSW_DE for a subnormal 64-bit operand.  That is
 * handled like this:
 *      - It is dropped if the first operand is a 387-invalid value or a NaN,
 *        or if @a a_DenormalException evaluates to true.
 *      - Otherwise, if DM is clear in FCW (unmasked), the first operand is
 *        returned as the result with DE, ES and B set, and @a a_fnR80ByR80 is
 *        not called.
 *      - Otherwise DE is ORed into the FSW produced by @a a_fnR80ByR80.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The 80-bit by 80-bit worker to call.
 * @param   a_DenormalException     Expression (may refer to pr80Val1) that,
 *                                  when true, suppresses the DE status from
 *                                  the operand conversion.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw) \
    { \
        if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                         | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5861
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * The emitted function converts the 32-bit operand to 80 bits, calls
 * @a a_fnR80ByR80 on the pair and then forces the TOP field of the resulting
 * FSW to 7.
 *
 * The conversion reports X86_FSW_DE for a subnormal 32-bit operand.  That is
 * handled like this:
 *      - It is dropped if the first operand is a 387-invalid value or a NaN,
 *        or if @a a_DenormalException evaluates to true.
 *      - Otherwise, if DM is clear in FCW (unmasked), the first operand is
 *        returned as the result with DE, ES and B set, and @a a_fnR80ByR80 is
 *        not called.
 *      - Otherwise DE is ORed into the FSW produced by @a a_fnR80ByR80.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The 80-bit by 80-bit worker to call.
 * @param   a_DenormalException     Expression (may refer to pr80Val1) that,
 *                                  when true, suppresses the DE status from
 *                                  the operand conversion.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw) \
    { \
        if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                         | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5884
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * The emitted function converts the 32-bit integer operand to 80-bit floating
 * point, calls @a a_fnR80ByR80 on the pair and then forces the TOP field of
 * the resulting FSW to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The 80-bit by 80-bit worker to call.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Converted; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Converted); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5893
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * The emitted function converts the 16-bit integer operand to 80-bit floating
 * point, calls @a a_fnR80ByR80 on the pair and then forces the TOP field of
 * the resulting FSW to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The 80-bit by 80-bit worker to call.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Converted; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Converted); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5902
5903
5904
5905/*********************************************************************************************************************************
*   x87 FPU Division Operations                                                                                                  *
5907*********************************************************************************************************************************/
5908
5909/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5910static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5911 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5912{
5913 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5914 {
5915 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5916 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5917 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5918 }
5919 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5920 { /* Div by zero. */
5921 if (fFcw & X86_FCW_ZM)
5922 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5923 else
5924 {
5925 *pr80Result = *pr80Val1Org;
5926 fFsw |= X86_FSW_ES | X86_FSW_B;
5927 }
5928 fFsw |= X86_FSW_ZE;
5929 }
5930 else
5931 { /* Invalid operand */
5932 if (fFcw & X86_FCW_IM)
5933 *pr80Result = g_r80Indefinite;
5934 else
5935 {
5936 *pr80Result = *pr80Val1Org;
5937 fFsw |= X86_FSW_ES | X86_FSW_B;
5938 }
5939 fFsw |= X86_FSW_IE;
5940 }
5941 return fFsw;
5942}
5943
5944
/**
 * Divides @a pr80Val1 by @a pr80Val2 and returns the result and FSW in
 * @a pFpuRes.
 *
 * The returned FSW starts out as the C0, C2 and C3 bits of the current FSW
 * with the TOP field set to 6.  Operands are screened in three steps:
 *      -# 387-invalid operands raise IE (indefinite result when masked,
 *         otherwise the first operand is returned with ES and B set).
 *      -# Denormal/pseudo-denormal operands raise DE unless trumped by a NaN
 *         or a division by zero; when DM is set the division is still carried
 *         out, otherwise the first operand is returned with ES and B set.
 *      -# Everything else goes straight to the worker.
 *
 * @param   pFpuState       The FPU state (FCW and FSW are read).
 * @param   pFpuRes         Where to return the result value and FSW.
 * @param   pr80Val1        The dividend.
 * @param   pr80Val2        The divisor.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t       fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
    if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
    {
        if (fFcw & X86_FCW_IM)
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
    else if (   (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
             || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
    {
        if (fFcw & X86_FCW_DM)
        {
            /* Remember the caller's operand: it is what gets returned should the
               worker hit an unmasked exception.  (The normalize macro presumably
               redirects the pointers to normalized copies - see its definition.) */
            PCRTFLOAT80U const pr80Val1Org = pr80Val1;
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
            fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
        }
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_DE;
    }
    /* SoftFloat can handle the rest: */
    else
        fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);

    pFpuRes->FSW = fFsw;
}
5987
5988
/* FDIV variants taking a 64-bit/32-bit float or a 32-bit/16-bit integer as
   divisor.  The zero passed to the float variants means there is no extra
   condition for dropping the denormal status of the converted operand. */
EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64,  iemAImpl_fdiv_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32,  iemAImpl_fdiv_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5993
5994
/**
 * Reverse division: divides @a pr80Val2 by @a pr80Val1 and returns the result
 * and FSW in @a pFpuRes.
 *
 * Same structure as iemAImpl_fdiv_r80_by_r80, except that the operands are
 * swapped when calling the worker and the division-by-zero exemption in the
 * denormal check applies to @a pr80Val1 (the divisor here).  On unmasked
 * exceptions it is still @a pr80Val1 that is returned as the result.
 *
 * @param   pFpuState       The FPU state (FCW and FSW are read).
 * @param   pFpuRes         Where to return the result value and FSW.
 * @param   pr80Val1        The divisor.
 * @param   pr80Val2        The dividend.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t       fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
    if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
    {
        if (fFcw & X86_FCW_IM)
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
    else if (   (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
             || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
    {
        if (fFcw & X86_FCW_DM)
        {
            /* Remember the caller's operand: it is what gets returned should the
               worker hit an unmasked exception.  (The normalize macro presumably
               redirects the pointers to normalized copies - see its definition.) */
            PCRTFLOAT80U const pr80Val1Org = pr80Val1;
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
            /* Note the swapped operands: pr80Val2 is the dividend. */
            fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
        }
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_DE;
    }
    /* SoftFloat can handle the rest: */
    else
        fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);

    pFpuRes->FSW = fFsw;
}
6037
6038
/* FDIVR variants taking a 64-bit/32-bit float or a 32-bit/16-bit integer as
   second operand.  For the float variants the denormal status of the
   converted operand is dropped when the first operand (the divisor) is zero. */
EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64,  iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32,  iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
6043
6044
6045/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
6046static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6047 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
6048{
6049 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
6050 {
6051 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6052 uint16_t fCxFlags = 0;
6053 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
6054 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
6055 &fCxFlags, &SoftState);
6056 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
6057 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6058 if ( !(fFsw & X86_FSW_IE)
6059 && !RTFLOAT80U_IS_NAN(pr80Result)
6060 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
6061 {
6062 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
6063 fFsw |= fCxFlags & X86_FSW_C_MASK;
6064 }
6065 return fFsw;
6066 }
6067
6068 /* Invalid operand */
6069 if (fFcw & X86_FCW_IM)
6070 *pr80Result = g_r80Indefinite;
6071 else
6072 {
6073 *pr80Result = *pr80Val1Org;
6074 fFsw |= X86_FSW_ES | X86_FSW_B;
6075 }
6076 return fFsw | X86_FSW_IE;
6077}
6078
6079
/**
 * Common code for iemAImpl_fprem_r80_by_r80 and iemAImpl_fprem1_r80_by_r80.
 *
 * The returned FSW starts out as the C0 and C3 bits of the current FSW (C2 is
 * deliberately not carried over) with the TOP field set to 6.  Operands are
 * screened for 387-invalid values and a zero divisor (IE), then for
 * denormals (DE), before the worker is called.
 *
 * @param   pFpuState       The FPU state (FCW and FSW are read).
 * @param   pFpuRes         Where to return the result value and FSW.
 * @param   pr80Val1        The dividend.
 * @param   pr80Val2        The divisor.
 * @param   fLegacyInstr    true for FPREM, false for FPREM1.
 */
static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                             PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t       fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
       In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
       of Indefinite. (Note! There is no #Z, unlike what the footnotes to tables 3-31 and
       3-32 for the FPREM & FPREM1 instructions in the intel reference manual claim!) */
    if (   RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
        || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
    {
        if (fFcw & X86_FCW_IM)
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
    else if (   (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
             || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
    {
        if (fFcw & X86_FCW_DM)
        {
            /* Remember the caller's operand: it is what gets returned should the
               worker hit an unmasked exception.  (The normalize macro presumably
               redirects the pointers to normalized copies - see its definition.) */
            PCRTFLOAT80U const pr80Val1Org = pr80Val1;
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
            fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
                                                           pr80Val1Org, fLegacyInstr);
        }
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_DE;
    }
    /* SoftFloat can handle the rest: */
    else
        fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
                                                       pr80Val1, fLegacyInstr);

    pFpuRes->FSW = fFsw;
}
6128
6129
/**
 * FPREM: partial remainder of @a pr80Val1 divided by @a pr80Val2, using the
 * legacy variant of the common FPREM/FPREM1 code.
 *
 * @param   pFpuState       The FPU state.
 * @param   pFpuRes         Where to return the result value and FSW.
 * @param   pr80Val1        The dividend.
 * @param   pr80Val2        The divisor.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
}
6135
6136
/**
 * FPREM1: partial remainder of @a pr80Val1 divided by @a pr80Val2, using the
 * non-legacy variant of the common FPREM/FPREM1 code.
 *
 * @param   pFpuState       The FPU state.
 * @param   pFpuRes         Where to return the result value and FSW.
 * @param   pr80Val1        The dividend.
 * @param   pr80Val2        The divisor.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                    PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
}
6142
6143
6144/*********************************************************************************************************************************
6145* x87 FPU Multiplication Operations *
6146*********************************************************************************************************************************/
6147
6148/** Worker for iemAImpl_fmul_r80_by_r80. */
6149static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6150 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6151{
6152 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6153 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6154 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6155}
6156
6157
/**
 * Multiplies @a pr80Val1 by @a pr80Val2 and returns the result and FSW in
 * @a pFpuRes.
 *
 * The returned FSW starts out as the C0, C2 and C3 bits of the current FSW
 * with the TOP field set to 6.  Operands are screened in three steps:
 *      -# 387-invalid operands raise IE (indefinite result when masked,
 *         otherwise the first operand is returned with ES and B set).
 *      -# Denormal/pseudo-denormal operands raise DE unless the other operand
 *         is a NaN; when DM is set the multiplication is still carried out,
 *         otherwise the first operand is returned with ES and B set.
 *      -# Everything else goes straight to the worker.
 *
 * @param   pFpuState       The FPU state (FCW and FSW are read).
 * @param   pFpuRes         Where to return the result value and FSW.
 * @param   pr80Val1        The first factor.
 * @param   pr80Val2        The second factor.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t       fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
    if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
    {
        if (fFcw & X86_FCW_IM)
            pFpuRes->r80Result = g_r80Indefinite;
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
    else if (   (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
             || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
    {
        if (fFcw & X86_FCW_DM)
        {
            /* Remember the caller's operand: it is what gets returned should the
               worker hit an unmasked exception.  (The normalize macro presumably
               redirects the pointers to normalized copies - see its definition.) */
            PCRTFLOAT80U const pr80Val1Org = pr80Val1;
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
            IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
            fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
        }
        else
        {
            pFpuRes->r80Result = *pr80Val1;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_DE;
    }
    /* SoftFloat can handle the rest: */
    else
        fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);

    pFpuRes->FSW = fFsw;
}
6200
6201
/* The r64, r32, i32 and i16 operand variants of the multiplication, each
   generated by an EMIT_R80_BY_* macro with iemAImpl_fmul_r80_by_r80 named as the
   base implementation.  NOTE(review): the macro definitions are outside this
   part of the file; presumably they convert the second operand and forward. */
6202EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
6203EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
6204EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
6205EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
6206
6207
6208/*********************************************************************************************************************************
6209* x87 FPU Addition *
6210*********************************************************************************************************************************/
6211
6212/** Worker for iemAImpl_fadd_r80_by_r80. */
6213static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6214 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6215{
6216 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6217 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6218 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6219}
6220
6221
6222IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6223 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6224{
6225 uint16_t const fFcw = pFpuState->FCW;
6226 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6227
6228 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6229 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6230 {
6231 if (fFcw & X86_FCW_IM)
6232 pFpuRes->r80Result = g_r80Indefinite;
6233 else
6234 {
6235 pFpuRes->r80Result = *pr80Val1;
6236 fFsw |= X86_FSW_ES | X86_FSW_B;
6237 }
6238 fFsw |= X86_FSW_IE;
6239 }
6240 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6241 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6242 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6243 {
6244 if (fFcw & X86_FCW_DM)
6245 {
6246 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6247 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6248 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6249 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6250 }
6251 else
6252 {
6253 pFpuRes->r80Result = *pr80Val1;
6254 fFsw |= X86_FSW_ES | X86_FSW_B;
6255 }
6256 fFsw |= X86_FSW_DE;
6257 }
6258 /* SoftFloat can handle the rest: */
6259 else
6260 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6261
6262 pFpuRes->FSW = fFsw;
6263}
6264
6265
/* The r64, r32, i32 and i16 operand variants of the addition, each generated by
   an EMIT_R80_BY_* macro with iemAImpl_fadd_r80_by_r80 named as the base
   implementation.  NOTE(review): the macro definitions are outside this part of
   the file; presumably they convert the second operand and forward. */
6266EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6267EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6268EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6269EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6270
6271
6272/*********************************************************************************************************************************
6273* x87 FPU Subtraction *
6274*********************************************************************************************************************************/
6275
6276/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6277static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6278 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6279{
6280 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6281 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6282 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6283}
6284
6285
6286IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6287 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6288{
6289 uint16_t const fFcw = pFpuState->FCW;
6290 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6291
6292 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6293 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6294 {
6295 if (fFcw & X86_FCW_IM)
6296 pFpuRes->r80Result = g_r80Indefinite;
6297 else
6298 {
6299 pFpuRes->r80Result = *pr80Val1;
6300 fFsw |= X86_FSW_ES | X86_FSW_B;
6301 }
6302 fFsw |= X86_FSW_IE;
6303 }
6304 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6305 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6306 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6307 {
6308 if (fFcw & X86_FCW_DM)
6309 {
6310 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6311 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6312 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6313 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6314 }
6315 else
6316 {
6317 pFpuRes->r80Result = *pr80Val1;
6318 fFsw |= X86_FSW_ES | X86_FSW_B;
6319 }
6320 fFsw |= X86_FSW_DE;
6321 }
6322 /* SoftFloat can handle the rest: */
6323 else
6324 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6325
6326 pFpuRes->FSW = fFsw;
6327}
6328
6329
/* The r64, r32, i32 and i16 operand variants of the subtraction, each generated
   by an EMIT_R80_BY_* macro with iemAImpl_fsub_r80_by_r80 named as the base
   implementation.  NOTE(review): the macro definitions are outside this part of
   the file; presumably they convert the second operand and forward. */
6330EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6331EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6332EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6333EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6334
6335
6336/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6337IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6338 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6339{
6340 uint16_t const fFcw = pFpuState->FCW;
6341 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6342
6343 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6344 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6345 {
6346 if (fFcw & X86_FCW_IM)
6347 pFpuRes->r80Result = g_r80Indefinite;
6348 else
6349 {
6350 pFpuRes->r80Result = *pr80Val1;
6351 fFsw |= X86_FSW_ES | X86_FSW_B;
6352 }
6353 fFsw |= X86_FSW_IE;
6354 }
6355 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6356 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6357 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6358 {
6359 if (fFcw & X86_FCW_DM)
6360 {
6361 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6362 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6363 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6364 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6365 }
6366 else
6367 {
6368 pFpuRes->r80Result = *pr80Val1;
6369 fFsw |= X86_FSW_ES | X86_FSW_B;
6370 }
6371 fFsw |= X86_FSW_DE;
6372 }
6373 /* SoftFloat can handle the rest: */
6374 else
6375 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6376
6377 pFpuRes->FSW = fFsw;
6378}
6379
6380
/* The r64, r32, i32 and i16 operand variants of the reverse subtraction, each
   generated by an EMIT_R80_BY_* macro with iemAImpl_fsubr_r80_by_r80 named as
   the base implementation.  NOTE(review): the macro definitions are outside
   this part of the file; presumably they convert the second operand and forward. */
6381EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6382EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6383EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6384EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6385
6386
6387/*********************************************************************************************************************************
6388* x87 FPU Trigonometric Operations *
6389*********************************************************************************************************************************/
6390static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6391{
6392 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6393 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6394 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6395 extFloat80_t v;
6396 (void)fFcw;
6397
6398 v = extF80_atan2(y, x, &SoftState);
6399
6400 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6401 return fFsw;
6402}
6403
6404IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6405 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6406{
6407 uint16_t const fFcw = pFpuState->FCW;
6408 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6409
6410 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6411 {
6412 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6413
6414 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6415 if (!(fFcw & X86_FCW_PM))
6416 fFsw |= X86_FSW_ES | X86_FSW_B;
6417 }
6418 else
6419 {
6420 fFsw |= X86_FSW_IE;
6421 if (!(fFcw & X86_FCW_IM))
6422 {
6423 pFpuRes->r80Result = *pr80Val2;
6424 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6425 }
6426 else
6427 {
6428 pFpuRes->r80Result = g_r80Indefinite;
6429 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6430 }
6431 }
6432
6433 pFpuRes->FSW = fFsw;
6434}
6435#endif /* IEM_WITHOUT_ASSEMBLY */
6436
/**
 * Intel-named entry point for the partial arctangent.
 *
 * Contains no vendor specific handling; the arguments are forwarded unchanged
 * to iemAImpl_fpatan_r80_by_r80.
 */
6437IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6438 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6439{
6440 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6441}
6442
/**
 * AMD-named entry point for the partial arctangent.
 *
 * Contains no vendor specific handling; the arguments are forwarded unchanged
 * to iemAImpl_fpatan_r80_by_r80.
 */
6443IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6444 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6445{
6446 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6447}
6448
6449
6450#if defined(IEM_WITHOUT_ASSEMBLY)
6451static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6452{
6453 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6454 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6455 extFloat80_t v;
6456 (void)fFcw;
6457
6458 v = extF80_tan(x, &SoftState);
6459
6460 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6461 return fFsw;
6462}
6463
6464IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6465{
6466 uint16_t const fFcw = pFpuState->FCW;
6467 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6468
6469 if (RTFLOAT80U_IS_ZERO(pr80Val))
6470 {
6471 pFpuResTwo->r80Result1 = *pr80Val;
6472 pFpuResTwo->r80Result2 = g_ar80One[0];
6473 }
6474 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6475 {
6476 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6477 {
6478 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6479 pFpuResTwo->r80Result1 = *pr80Val;
6480 }
6481 else
6482 {
6483 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6484 {
6485 pFpuResTwo->r80Result1 = *pr80Val;
6486 }
6487 else
6488 {
6489 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6490 }
6491
6492 pFpuResTwo->r80Result2 = g_ar80One[0];
6493
6494 fFsw |= X86_FSW_PE;
6495 if (!(fFcw & X86_FCW_PM))
6496 fFsw |= X86_FSW_ES | X86_FSW_B;
6497 }
6498 }
6499 else
6500 {
6501 fFsw |= X86_FSW_IE;
6502 if (!(fFcw & X86_FCW_IM))
6503 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6504 }
6505
6506 pFpuResTwo->FSW = fFsw;
6507}
6508#endif /* IEM_WITHOUT_ASSEMBLY */
6509
/**
 * AMD-named entry point for the partial tangent.
 *
 * Contains no vendor specific handling; the arguments are forwarded unchanged
 * to iemAImpl_fptan_r80_r80.
 */
6510IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6511{
6512 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6513}
6514
/**
 * Intel-named entry point for the partial tangent.
 *
 * Contains no vendor specific handling; the arguments are forwarded unchanged
 * to iemAImpl_fptan_r80_r80.
 */
6515IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6516{
6517 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6518}
6519
6520#ifdef IEM_WITHOUT_ASSEMBLY
6521
6522static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6523{
6524 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6525 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6526 extFloat80_t v;
6527 (void)fFcw;
6528
6529 v = extF80_sin(x, &SoftState);
6530
6531 iemFpuSoftF80ToIprt(pr80Result, v);
6532
6533 return fFsw;
6534}
6535
6536IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6537{
6538 uint16_t const fFcw = pFpuState->FCW;
6539 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6540
6541 if (RTFLOAT80U_IS_ZERO(pr80Val))
6542 {
6543 pFpuRes->r80Result = *pr80Val;
6544 }
6545 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6546 {
6547 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6548 {
6549 fFsw |= X86_FSW_C2;
6550 pFpuRes->r80Result = *pr80Val;
6551 }
6552 else
6553 {
6554 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6555 {
6556 pFpuRes->r80Result = *pr80Val;
6557 }
6558 else
6559 {
6560 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6561 }
6562 fFsw |= X86_FSW_PE;
6563 if (!(fFcw & X86_FCW_PM))
6564 fFsw |= X86_FSW_ES | X86_FSW_B;
6565 }
6566 }
6567 else if (RTFLOAT80U_IS_INF(pr80Val))
6568 {
6569 fFsw |= X86_FSW_IE;
6570 if (!(fFcw & X86_FCW_IM))
6571 {
6572 fFsw |= X86_FSW_ES | X86_FSW_B;
6573 pFpuRes->r80Result = *pr80Val;
6574 }
6575 else
6576 {
6577 pFpuRes->r80Result = g_r80Indefinite;
6578 }
6579 }
6580 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6581 {
6582 fFsw |= X86_FSW_DE;
6583
6584 if (fFcw & X86_FCW_DM)
6585 {
6586 if (fFcw & X86_FCW_UM)
6587 {
6588 pFpuRes->r80Result = *pr80Val;
6589 }
6590 else
6591 {
6592 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6593 uint64_t uMantissa = pr80Val->s.uMantissa;
6594 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6595
6596 uExponent = 64 - uExponent;
6597 uMantissa <<= uExponent;
6598 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6599
6600 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6601 pFpuRes->r80Result.s.uMantissa = uMantissa;
6602 pFpuRes->r80Result.s.uExponent = uExponent;
6603 }
6604
6605 fFsw |= X86_FSW_UE | X86_FSW_PE;
6606
6607 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6608 {
6609 /* All the exceptions are masked. */
6610 }
6611 else
6612 {
6613 fFsw |= X86_FSW_ES | X86_FSW_B;
6614 }
6615 }
6616 else
6617 {
6618 pFpuRes->r80Result = *pr80Val;
6619
6620 fFsw |= X86_FSW_ES | X86_FSW_B;
6621 }
6622 }
6623 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6624 {
6625 pFpuRes->r80Result = *pr80Val;
6626 fFsw |= X86_FSW_DE;
6627
6628 if (fFcw & X86_FCW_DM)
6629 {
6630 if (fFcw & X86_FCW_PM)
6631 {
6632 fFsw |= X86_FSW_PE;
6633 }
6634 else
6635 {
6636 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6637 }
6638
6639 pFpuRes->r80Result.sj64.uExponent = 1;
6640 }
6641 else
6642 {
6643 fFsw |= X86_FSW_ES | X86_FSW_B;
6644 }
6645 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6646 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6647 {
6648 pFpuRes->r80Result = *pr80Val;
6649 } else {
6650 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6651 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6652 && (fFcw & X86_FCW_IM))
6653 pFpuRes->r80Result = g_r80Indefinite;
6654 else
6655 {
6656 pFpuRes->r80Result = *pr80Val;
6657 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6658 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6659 }
6660
6661 fFsw |= X86_FSW_IE;
6662 if (!(fFcw & X86_FCW_IM))
6663 fFsw |= X86_FSW_ES | X86_FSW_B;
6664 }
6665
6666 pFpuRes->FSW = fFsw;
6667}
6668#endif /* IEM_WITHOUT_ASSEMBLY */
6669
/**
 * AMD-named entry point for the sine.
 *
 * Contains no vendor specific handling; the arguments are forwarded unchanged
 * to iemAImpl_fsin_r80.
 */
6670IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6671{
6672 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6673}
6674
/**
 * Intel-named entry point for the sine.
 *
 * Contains no vendor specific handling; the arguments are forwarded unchanged
 * to iemAImpl_fsin_r80.
 */
6675IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6676{
6677 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6678}
6679
6680#ifdef IEM_WITHOUT_ASSEMBLY
6681
6682static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6683{
6684 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6685 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6686 extFloat80_t v;
6687 (void)fFcw;
6688
6689 v = extF80_cos(x, &SoftState);
6690
6691 iemFpuSoftF80ToIprt(pr80Result, v);
6692
6693 return fFsw;
6694}
6695
6696IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6697{
6698 uint16_t const fFcw = pFpuState->FCW;
6699 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6700
6701 if (RTFLOAT80U_IS_ZERO(pr80Val))
6702 {
6703 pFpuRes->r80Result = g_ar80One[0];
6704 }
6705 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6706 {
6707 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6708 {
6709 fFsw |= X86_FSW_C2;
6710 pFpuRes->r80Result = *pr80Val;
6711 }
6712 else
6713 {
6714 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6715 {
6716 pFpuRes->r80Result = g_ar80One[0];
6717
6718 }
6719 else
6720 {
6721 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6722 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6723 }
6724 fFsw |= X86_FSW_PE;
6725 if (!(fFcw & X86_FCW_PM))
6726 fFsw |= X86_FSW_ES | X86_FSW_B;
6727 }
6728 }
6729 else if (RTFLOAT80U_IS_INF(pr80Val))
6730 {
6731 fFsw |= X86_FSW_IE;
6732 if (!(fFcw & X86_FCW_IM))
6733 {
6734 fFsw |= X86_FSW_ES | X86_FSW_B;
6735 pFpuRes->r80Result = *pr80Val;
6736 }
6737 else
6738 {
6739 pFpuRes->r80Result = g_r80Indefinite;
6740 }
6741 }
6742 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6743 {
6744 fFsw |= X86_FSW_DE;
6745
6746 if (fFcw & X86_FCW_DM)
6747 {
6748 pFpuRes->r80Result = g_ar80One[0];
6749
6750 if (fFcw & X86_FCW_PM)
6751 {
6752 fFsw |= X86_FSW_PE;
6753 }
6754 else
6755 {
6756 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6757 }
6758 }
6759 else
6760 {
6761 pFpuRes->r80Result = *pr80Val;
6762 fFsw |= X86_FSW_ES | X86_FSW_B;
6763 }
6764 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6765 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6766 {
6767 pFpuRes->r80Result = *pr80Val;
6768 } else {
6769 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6770 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6771 && (fFcw & X86_FCW_IM))
6772 pFpuRes->r80Result = g_r80Indefinite;
6773 else
6774 {
6775 pFpuRes->r80Result = *pr80Val;
6776 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6777 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6778 }
6779
6780 fFsw |= X86_FSW_IE;
6781 if (!(fFcw & X86_FCW_IM))
6782 fFsw |= X86_FSW_ES | X86_FSW_B;
6783 }
6784
6785 pFpuRes->FSW = fFsw;
6786}
6787#endif /* IEM_WITHOUT_ASSEMBLY */
6788
/**
 * AMD-named entry point for the cosine.
 *
 * Contains no vendor specific handling; the arguments are forwarded unchanged
 * to iemAImpl_fcos_r80.
 */
6789IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6790{
6791 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6792}
6793
/**
 * Intel-named entry point for the cosine.
 *
 * Contains no vendor specific handling; the arguments are forwarded unchanged
 * to iemAImpl_fcos_r80.
 */
6794IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6795{
6796 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6797}
6798
6799#ifdef IEM_WITHOUT_ASSEMBLY
6800
6801static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6802{
6803 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6804 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6805 extFloat80_t r80Sin, r80Cos;
6806 (void)fFcw;
6807
6808 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6809
6810 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6811 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6812
6813 return fFsw;
6814}
6815
6816IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6817{
6818 uint16_t const fFcw = pFpuState->FCW;
6819 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6820
6821 if (RTFLOAT80U_IS_ZERO(pr80Val))
6822 {
6823 pFpuResTwo->r80Result1 = *pr80Val;
6824 pFpuResTwo->r80Result2 = g_ar80One[0];
6825 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6826 }
6827 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6828 {
6829 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6830 {
6831 fFsw |= X86_FSW_C2;
6832
6833 if (fFcw & X86_FCW_IM)
6834 {
6835 pFpuResTwo->r80Result1 = g_r80Indefinite;
6836 }
6837 else
6838 {
6839 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6840 }
6841
6842 pFpuResTwo->r80Result2 = *pr80Val;
6843 }
6844 else
6845 {
6846 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6847
6848 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6849 {
6850 pFpuResTwo->r80Result1 = *pr80Val;
6851 pFpuResTwo->r80Result2 = g_ar80One[0];
6852 }
6853 else
6854 {
6855 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6856 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6857 }
6858 fFsw |= X86_FSW_PE;
6859 if (!(fFcw & X86_FCW_PM))
6860 fFsw |= X86_FSW_ES | X86_FSW_B;
6861 }
6862 }
6863 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6864 {
6865 fFsw |= X86_FSW_DE;
6866
6867 if (fFcw & X86_FCW_DM)
6868 {
6869 pFpuResTwo->r80Result1 = *pr80Val;
6870 pFpuResTwo->r80Result2 = g_ar80One[0];
6871 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6872
6873 if (fFcw & X86_FCW_PM)
6874 {
6875 fFsw |= X86_FSW_PE;
6876 }
6877 else
6878 {
6879 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6880 }
6881
6882 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6883 }
6884 else
6885 {
6886 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6887 pFpuResTwo->r80Result2 = *pr80Val;
6888 fFsw |= X86_FSW_ES | X86_FSW_B;
6889 }
6890 }
6891 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6892 {
6893 fFsw |= X86_FSW_DE;
6894
6895 if (fFcw & X86_FCW_DM)
6896 {
6897 pFpuResTwo->r80Result2 = g_ar80One[0];
6898
6899 if (fFcw & X86_FCW_UM)
6900 {
6901 pFpuResTwo->r80Result1 = *pr80Val;
6902 }
6903 else
6904 {
6905 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6906 uint64_t uMantissa = pr80Val->s.uMantissa;
6907 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6908
6909 uExponent = 64 - uExponent;
6910 uMantissa <<= uExponent;
6911 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6912
6913 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6914 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6915 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6916 }
6917
6918 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6919 fFsw |= X86_FSW_UE | X86_FSW_PE;
6920
6921 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6922 {
6923 /* All the exceptions are masked. */
6924 }
6925 else
6926 {
6927 fFsw |= X86_FSW_ES | X86_FSW_B;
6928 }
6929 }
6930 else
6931 {
6932 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6933 pFpuResTwo->r80Result2 = *pr80Val;
6934 fFsw |= X86_FSW_ES | X86_FSW_B;
6935 }
6936 }
6937 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6938 {
6939 pFpuResTwo->r80Result1 = *pr80Val;
6940 pFpuResTwo->r80Result2 = *pr80Val;
6941 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6942 }
6943 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6944 {
6945 if (fFcw & X86_FCW_IM)
6946 {
6947 pFpuResTwo->r80Result1 = g_r80Indefinite;
6948 pFpuResTwo->r80Result2 = g_r80Indefinite;
6949 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6950 }
6951 else
6952 {
6953 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6954 pFpuResTwo->r80Result2 = *pr80Val;
6955 }
6956
6957 fFsw |= X86_FSW_IE;
6958 if (!(fFcw & X86_FCW_IM))
6959 fFsw |= X86_FSW_ES | X86_FSW_B;
6960 }
6961 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6962 {
6963 pFpuResTwo->r80Result1 = *pr80Val;
6964 pFpuResTwo->r80Result2 = *pr80Val;
6965
6966 if (fFcw & X86_FCW_IM)
6967 {
6968 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6969 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6970 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6971 }
6972 else
6973 {
6974 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6975 pFpuResTwo->r80Result2 = *pr80Val;
6976 }
6977
6978 fFsw |= X86_FSW_IE;
6979 if (!(fFcw & X86_FCW_IM))
6980 fFsw |= X86_FSW_ES | X86_FSW_B;
6981 }
6982 else if (RTFLOAT80U_IS_INF(pr80Val))
6983 {
6984 if (fFcw & X86_FCW_IM)
6985 {
6986 pFpuResTwo->r80Result1 = g_r80Indefinite;
6987 pFpuResTwo->r80Result2 = g_r80Indefinite;
6988 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6989 }
6990 else
6991 {
6992 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6993 pFpuResTwo->r80Result2 = *pr80Val;
6994 }
6995
6996 fFsw |= X86_FSW_IE;
6997 if (!(fFcw & X86_FCW_IM))
6998 fFsw |= X86_FSW_ES | X86_FSW_B;
6999 }
7000
7001 pFpuResTwo->FSW = fFsw;
7002}
7003#endif /* IEM_WITHOUT_ASSEMBLY */
7004
/**
 * AMD-named entry point for the combined sine/cosine.
 *
 * Contains no vendor specific handling; the arguments are forwarded unchanged
 * to iemAImpl_fsincos_r80_r80.
 */
7005IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7006{
7007 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
7008}
7009
/**
 * Intel-named entry point for the combined sine/cosine.
 *
 * Contains no vendor specific handling; the arguments are forwarded unchanged
 * to iemAImpl_fsincos_r80_r80.
 */
7010IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7011{
7012 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
7013}
7014
7015#ifdef IEM_WITHOUT_ASSEMBLY
7016
7017
7018/*********************************************************************************************************************************
7019* x87 FPU Compare and Testing Operations *
7020*********************************************************************************************************************************/
7021
7022IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
7023{
7024 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
7025
7026 if (RTFLOAT80U_IS_ZERO(pr80Val))
7027 fFsw |= X86_FSW_C3;
7028 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
7029 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
7030 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7031 {
7032 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
7033 if (!(pFpuState->FCW & X86_FCW_DM))
7034 fFsw |= X86_FSW_ES | X86_FSW_B;
7035 }
7036 else
7037 {
7038 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
7039 if (!(pFpuState->FCW & X86_FCW_IM))
7040 fFsw |= X86_FSW_ES | X86_FSW_B;
7041 }
7042
7043 *pu16Fsw = fFsw;
7044}
7045
7046
7047IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
7048{
7049 RT_NOREF(pFpuState);
7050 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
7051
7052 /* C1 = sign bit (always, even if empty Intel says). */
7053 if (pr80Val->s.fSign)
7054 fFsw |= X86_FSW_C1;
7055
7056 /* Classify the value in C0, C2, C3. */
7057 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
7058 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
7059 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
7060 fFsw |= X86_FSW_C2;
7061 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7062 fFsw |= X86_FSW_C3;
7063 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
7064 fFsw |= X86_FSW_C0;
7065 else if (RTFLOAT80U_IS_INF(pr80Val))
7066 fFsw |= X86_FSW_C0 | X86_FSW_C2;
7067 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7068 fFsw |= X86_FSW_C2 | X86_FSW_C3;
7069 /* whatever else: 0 */
7070
7071 *pu16Fsw = fFsw;
7072}
7073
7074
7075/**
7076 * Worker for fcom, fucom, and friends.
7077 */
7078static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7079 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
7080{
7081 /*
7082 * Unpack the values.
7083 */
7084 bool const fSign1 = pr80Val1->s.fSign;
7085 int32_t iExponent1 = pr80Val1->s.uExponent;
7086 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
7087
7088 bool const fSign2 = pr80Val2->s.fSign;
7089 int32_t iExponent2 = pr80Val2->s.uExponent;
7090 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
7091
7092 /*
7093 * Check for invalid inputs.
7094 */
7095 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
7096 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
7097 {
7098 if (!(fFcw & X86_FCW_IM))
7099 fFsw |= X86_FSW_ES | X86_FSW_B;
7100 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
7101 }
7102
7103 /*
7104 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
7105 */
7106 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
7107 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
7108 {
7109 if ( fIeOnAllNaNs
7110 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
7111 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
7112 {
7113 fFsw |= X86_FSW_IE;
7114 if (!(fFcw & X86_FCW_IM))
7115 fFsw |= X86_FSW_ES | X86_FSW_B;
7116 }
7117 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
7118 }
7119
7120 /*
7121 * Normalize the values.
7122 */
7123 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7124 {
7125 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7126 iExponent1 = 1;
7127 else
7128 {
7129 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
7130 uMantissa1 <<= iExponent1;
7131 iExponent1 = 1 - iExponent1;
7132 }
7133 fFsw |= X86_FSW_DE;
7134 if (!(fFcw & X86_FCW_DM))
7135 fFsw |= X86_FSW_ES | X86_FSW_B;
7136 }
7137
7138 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7139 {
7140 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7141 iExponent2 = 1;
7142 else
7143 {
7144 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
7145 uMantissa2 <<= iExponent2;
7146 iExponent2 = 1 - iExponent2;
7147 }
7148 fFsw |= X86_FSW_DE;
7149 if (!(fFcw & X86_FCW_DM))
7150 fFsw |= X86_FSW_ES | X86_FSW_B;
7151 }
7152
7153 /*
7154 * Test if equal (val1 == val2):
7155 */
7156 if ( uMantissa1 == uMantissa2
7157 && iExponent1 == iExponent2
7158 && ( fSign1 == fSign2
7159 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
7160 fFsw |= X86_FSW_C3;
7161 /*
7162 * Test if less than (val1 < val2):
7163 */
7164 else if (fSign1 && !fSign2)
7165 fFsw |= X86_FSW_C0;
7166 else if (fSign1 == fSign2)
7167 {
7168 /* Zeros are problematic, however at the most one can be zero here. */
7169 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
7170 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7171 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
7172 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7173
7174 if ( fSign1
7175 ^ ( iExponent1 < iExponent2
7176 || ( iExponent1 == iExponent2
7177 && uMantissa1 < uMantissa2 ) ) )
7178 fFsw |= X86_FSW_C0;
7179 }
7180 /* else: No flags set if greater. */
7181
7182 return fFsw;
7183}
7184
7185
/**
 * Ordered compare of two 80-bit values.
 *
 * Calls the common compare worker with a status word seed of TOP=6 and with
 * fIeOnAllNaNs set, and stores the returned status word in @a pfFsw.
 */
7186IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7187 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7188{
7189 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7190}
7191
7192
7193
7194
/**
 * Unordered compare of two 80-bit values.
 *
 * Calls the common compare worker with a status word seed of TOP=6 and with
 * fIeOnAllNaNs clear, and stores the returned status word in @a pfFsw.
 */
7195IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7196 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7197{
7198 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
7199}
7200
7201
7202IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7203 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
7204{
7205 RTFLOAT80U r80Val2;
7206 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
7207 Assert(!fFsw || fFsw == X86_FSW_DE);
7208 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7209 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7210 {
7211 if (!(pFpuState->FCW & X86_FCW_DM))
7212 fFsw |= X86_FSW_ES | X86_FSW_B;
7213 *pfFsw |= fFsw;
7214 }
7215}
7216
7217
7218IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7219 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
7220{
7221 RTFLOAT80U r80Val2;
7222 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
7223 Assert(!fFsw || fFsw == X86_FSW_DE);
7224 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7225 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7226 {
7227 if (!(pFpuState->FCW & X86_FCW_DM))
7228 fFsw |= X86_FSW_ES | X86_FSW_B;
7229 *pfFsw |= fFsw;
7230 }
7231}
7232
7233
7234IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7235 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
7236{
7237 RTFLOAT80U r80Val2;
7238 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
7239 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7240}
7241
7242
7243IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7244 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
7245{
7246 RTFLOAT80U r80Val2;
7247 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
7248 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7249}
7250
7251
7252/**
7253 * Worker for fcomi & fucomi.
7254 */
7255static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7256 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
7257{
7258 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
7259 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
7260 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
7261 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
7262
7263 /* Note! C1 is not cleared as per docs! Everything is preserved. */
7264 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
7265 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
7266}
7267
7268
7269IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7270 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7271{
7272 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
7273}
7274
7275
7276IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7277 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7278{
7279 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
7280}
7281
7282
7283/*********************************************************************************************************************************
7284* x87 FPU Other Operations *
7285*********************************************************************************************************************************/
7286
7287/**
7288 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7289 */
7290static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7291{
7292 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7293 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7294 true /*exact / generate #PE */, &SoftState));
7295 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7296}
7297
7298
7299IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7300{
7301 uint16_t const fFcw = pFpuState->FCW;
7302 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7303
7304 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7305 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7306 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7307 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7308 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7309 || RTFLOAT80U_IS_INF(pr80Val))
7310 pFpuRes->r80Result = *pr80Val;
7311 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7312 {
7313 fFsw |= X86_FSW_DE;
7314 if (fFcw & X86_FCW_DM)
7315 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7316 else
7317 {
7318 pFpuRes->r80Result = *pr80Val;
7319 fFsw |= X86_FSW_ES | X86_FSW_B;
7320 }
7321 }
7322 else
7323 {
7324 if (fFcw & X86_FCW_IM)
7325 {
7326 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7327 pFpuRes->r80Result = g_r80Indefinite;
7328 else
7329 {
7330 pFpuRes->r80Result = *pr80Val;
7331 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7332 }
7333 }
7334 else
7335 {
7336 pFpuRes->r80Result = *pr80Val;
7337 fFsw |= X86_FSW_ES | X86_FSW_B;
7338 }
7339 fFsw |= X86_FSW_IE;
7340 }
7341 pFpuRes->FSW = fFsw;
7342}
7343
7344
7345IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7346 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7347{
7348 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7349 it does everything we need it to do. */
7350 uint16_t const fFcw = pFpuState->FCW;
7351 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7352 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7353 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7354 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7355}
7356
7357
7358/**
7359 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7360 */
7361static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7362{
7363 Assert(!pr80Val->s.fSign);
7364 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7365 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7366 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7367}
7368
7369
7370IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7371{
7372 uint16_t const fFcw = pFpuState->FCW;
7373 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7374
7375 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7376 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7377 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7378 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7379 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7380 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7381 pFpuRes->r80Result = *pr80Val;
7382 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7383 {
7384 fFsw |= X86_FSW_DE;
7385 if (fFcw & X86_FCW_DM)
7386 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7387 else
7388 {
7389 pFpuRes->r80Result = *pr80Val;
7390 fFsw |= X86_FSW_ES | X86_FSW_B;
7391 }
7392 }
7393 else
7394 {
7395 if (fFcw & X86_FCW_IM)
7396 {
7397 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7398 pFpuRes->r80Result = g_r80Indefinite;
7399 else
7400 {
7401 pFpuRes->r80Result = *pr80Val;
7402 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7403 }
7404 }
7405 else
7406 {
7407 pFpuRes->r80Result = *pr80Val;
7408 fFsw |= X86_FSW_ES | X86_FSW_B;
7409 }
7410 fFsw |= X86_FSW_IE;
7411 }
7412 pFpuRes->FSW = fFsw;
7413}
7414
7415
7416/**
7417 * @code{.unparsed}
7418 * x x * ln2
7419 * f(x) = 2 - 1 = e - 1
7420 *
7421 * @endcode
7422 *
7423 * We can approximate e^x by a Taylor/Maclaurin series (see
7424 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7425 * @code{.unparsed}
7426 * n 0 1 2 3 4
7427 * inf x x x x x x
7428 * SUM ----- = --- + --- + --- + --- + --- + ...
7429 * n=0 n! 0! 1! 2! 3! 4!
7430 *
7431 * 2 3 4
7432 * x x x
7433 * = 1 + x + --- + --- + --- + ...
7434 * 2! 3! 4!
7435 * @endcode
7436 *
7437 * Given z = x * ln2, we get:
7438 * @code{.unparsed}
7439 * 2 3 4 n
7440 * z z z z z
7441 * e - 1 = z + --- + --- + --- + ... + ---
7442 * 2! 3! 4! n!
7443 * @endcode
7444 *
7445 * Wanting to use Horner's method, we move one z outside and get:
7446 * @code{.unparsed}
7447 * 2 3 (n-1)
7448 * z z z z
7449 * = z ( 1 + --- + --- + --- + ... + ------- )
7450 * 2! 3! 4! n!
7451 * @endcode
7452 *
7453 * The constants we need for using Horner's methods are 1 and 1 / n!.
7454 *
7455 * For very tiny x values, we can get away with f(x) = x * ln 2, because
7456 * because we don't have the necessary precision to represent 1.0 + z/3 + ...
7457 * and can approximate it to be 1.0. For a visual demonstration of this
7458 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
7459 * as it valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7460 *
7461 *
7462 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7463 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7464 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7465 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7466 * blocks). (The one bit difference is probably an implicit one missing from
7467 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7468 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7469 * exponent.
7470 *
7471 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
7472 * successfully reproduced the exact results from an Intel 10980XE, there is
7473 * always a portition of rounding differences. Not going to spend too much time
7474 * on getting this 100% the same, at least not now.
7475 *
7476 * P.S. If someone are really curious about 8087 and its contstants:
7477 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7478 *
7479 *
7480 * @param pr80Val The exponent value (x), less than 1.0, greater than
7481 * -1.0 and not zero. This can be a normal, denormal
7482 * or pseudo-denormal value.
7483 * @param pr80Result Where to return the result.
7484 * @param fFcw FPU control word.
7485 * @param fFsw FPU status word.
7486 */
7487static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7488{
7489 /* As mentioned above, we can skip the expensive polynomial calculation
7490 as it will be close enough to 1.0 that it makes no difference.
7491
7492 The cutoff point for intel 10980XE is exponents >= -69. Intel
7493 also seems to be using a 67-bit or 68-bit constant value, and we get
7494 a smattering of rounding differences if we go for higher precision. */
7495 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7496 {
7497 RTUINT256U u256;
7498 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7499 u256.QWords.qw0 |= 1; /* force #PE */
7500 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7501 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7502 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7503 : 1 - RTFLOAT80U_EXP_BIAS,
7504 fFcw, fFsw);
7505 }
7506 else
7507 {
7508#ifdef IEM_WITH_FLOAT128_FOR_FPU
7509 /* This approach is not good enough for small values, we end up with zero. */
7510 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7511 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7512 _Float128 rd128Result = powf128(2.0L, rd128Val);
7513 rd128Result -= 1.0L;
7514 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7515 iemFpuF128RestoreRounding(fOldRounding);
7516
7517# else
7518 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7519 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7520
7521 /* As mentioned above, enforce 68-bit internal mantissa width to better
7522 match the Intel 10980XE results. */
7523 unsigned const cPrecision = 68;
7524
7525 /* first calculate z = x * ln2 */
7526 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7527 cPrecision);
7528
7529 /* Then do the polynomial evaluation. */
7530 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7531 cPrecision, &SoftState);
7532 r = f128_mul(z, r, &SoftState);
7533
7534 /* Output the result. */
7535 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7536# endif
7537 }
7538 return fFsw;
7539}
7540
7541
7542IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7543{
7544 uint16_t const fFcw = pFpuState->FCW;
7545 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7546
7547 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7548 {
7549 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7550 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7551 else
7552 {
7553 /* Special case:
7554 2^+1.0 - 1.0 = 1.0
7555 2^-1.0 - 1.0 = -0.5 */
7556 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7557 && pr80Val->s.uMantissa == RT_BIT_64(63))
7558 {
7559 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7560 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7561 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7562 }
7563 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7564 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7565 else
7566 pFpuRes->r80Result = *pr80Val;
7567 fFsw |= X86_FSW_PE;
7568 if (!(fFcw & X86_FCW_PM))
7569 fFsw |= X86_FSW_ES | X86_FSW_B;
7570 }
7571 }
7572 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7573 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7574 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7575 pFpuRes->r80Result = *pr80Val;
7576 else if (RTFLOAT80U_IS_INF(pr80Val))
7577 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7578 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7579 {
7580 fFsw |= X86_FSW_DE;
7581 if (fFcw & X86_FCW_DM)
7582 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7583 else
7584 {
7585 pFpuRes->r80Result = *pr80Val;
7586 fFsw |= X86_FSW_ES | X86_FSW_B;
7587 }
7588 }
7589 else
7590 {
7591 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7592 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7593 && (fFcw & X86_FCW_IM))
7594 pFpuRes->r80Result = g_r80Indefinite;
7595 else
7596 {
7597 pFpuRes->r80Result = *pr80Val;
7598 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7599 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7600 }
7601 fFsw |= X86_FSW_IE;
7602 if (!(fFcw & X86_FCW_IM))
7603 fFsw |= X86_FSW_ES | X86_FSW_B;
7604 }
7605 pFpuRes->FSW = fFsw;
7606}
7607
7608#endif /* IEM_WITHOUT_ASSEMBLY */
7609
/**
 * The "_amd" named variant of F2XM1.
 *
 * No vendor specific behavior is implemented here, the call is forwarded
 * unchanged to iemAImpl_f2xm1_r80.
 *
 * @param   pFpuState   The FPU state, passed on as is.
 * @param   pFpuRes     Where to return the result and the status word.
 * @param   pr80Val     The exponent value (x).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
}
7614
/**
 * The "_intel" named variant of F2XM1.
 *
 * No vendor specific behavior is implemented here, the call is forwarded
 * unchanged to iemAImpl_f2xm1_r80.
 *
 * @param   pFpuState   The FPU state, passed on as is.
 * @param   pFpuRes     Where to return the result and the status word.
 * @param   pr80Val     The exponent value (x).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
}
7619
7620#ifdef IEM_WITHOUT_ASSEMBLY
7621
7622IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7623{
7624 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7625 pFpuRes->r80Result = *pr80Val;
7626 pFpuRes->r80Result.s.fSign = 0;
7627}
7628
7629
7630IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7631{
7632 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7633 pFpuRes->r80Result = *pr80Val;
7634 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7635}
7636
7637
7638IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7639{
7640 uint16_t const fFcw = pFpuState->FCW;
7641 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7642
7643 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7644 {
7645 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7646 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7647
7648 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7649 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7650 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7651 }
7652 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7653 {
7654 fFsw |= X86_FSW_ZE;
7655 if (fFcw & X86_FCW_ZM)
7656 {
7657 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7658 pFpuResTwo->r80Result2 = *pr80Val;
7659 }
7660 else
7661 {
7662 pFpuResTwo->r80Result2 = *pr80Val;
7663 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7664 }
7665 }
7666 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7667 {
7668 fFsw |= X86_FSW_DE;
7669 if (fFcw & X86_FCW_DM)
7670 {
7671 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7672 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7673 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7674 int32_t iExponent = -16382;
7675 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7676 {
7677 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7678 iExponent--;
7679 }
7680
7681 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7682 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7683 }
7684 else
7685 {
7686 pFpuResTwo->r80Result2 = *pr80Val;
7687 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7688 }
7689 }
7690 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7691 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7692 {
7693 pFpuResTwo->r80Result1 = *pr80Val;
7694 pFpuResTwo->r80Result2 = *pr80Val;
7695 }
7696 else if (RTFLOAT80U_IS_INF(pr80Val))
7697 {
7698 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7699 pFpuResTwo->r80Result2 = *pr80Val;
7700 }
7701 else
7702 {
7703 if (fFcw & X86_FCW_IM)
7704 {
7705 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7706 pFpuResTwo->r80Result1 = g_r80Indefinite;
7707 else
7708 {
7709 pFpuResTwo->r80Result1 = *pr80Val;
7710 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7711 }
7712 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7713 }
7714 else
7715 {
7716 pFpuResTwo->r80Result2 = *pr80Val;
7717 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7718 }
7719 fFsw |= X86_FSW_IE;
7720 }
7721 pFpuResTwo->FSW = fFsw;
7722}
7723#endif /* IEM_WITHOUT_ASSEMBLY */
7724
7725#if defined(IEM_WITHOUT_ASSEMBLY)
7726
7727static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7728{
7729 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7730 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7731 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7732 extFloat80_t v;
7733 (void)fFcw;
7734
7735 v = extF80_ylog2x(y, x, &SoftState);
7736 iemFpuSoftF80ToIprt(pr80Result, v);
7737
7738 return fFsw;
7739}
7740
7741IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7742 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7743{
7744 uint16_t const fFcw = pFpuState->FCW;
7745 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7746
7747 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7748 {
7749 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7750
7751 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7752 if (!(fFcw & X86_FCW_PM))
7753 fFsw |= X86_FSW_ES | X86_FSW_B;
7754 }
7755 else
7756 {
7757 fFsw |= X86_FSW_IE;
7758
7759 if (!(fFcw & X86_FCW_IM))
7760 {
7761 pFpuRes->r80Result = *pr80Val2;
7762 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7763 }
7764 else
7765 {
7766 pFpuRes->r80Result = g_r80Indefinite;
7767 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7768 }
7769 }
7770
7771 pFpuRes->FSW = fFsw;
7772}
7773#endif /* IEM_WITHOUT_ASSEMBLY */
7774
/**
 * The "_intel" named variant of FYL2X.
 *
 * No vendor specific behavior is implemented here, the call is forwarded
 * unchanged to iemAImpl_fyl2x_r80_by_r80.
 *
 * @param   pFpuState   The FPU state, passed on as is.
 * @param   pFpuRes     Where to return the result and the status word.
 * @param   pr80Val1    The first operand.
 * @param   pr80Val2    The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                         PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7780
/**
 * The "_amd" named variant of FYL2X.
 *
 * No vendor specific behavior is implemented here, the call is forwarded
 * unchanged to iemAImpl_fyl2x_r80_by_r80.
 *
 * @param   pFpuState   The FPU state, passed on as is.
 * @param   pFpuRes     Where to return the result and the status word.
 * @param   pr80Val1    The first operand.
 * @param   pr80Val2    The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                       PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7786
7787#if defined(IEM_WITHOUT_ASSEMBLY)
7788
7789static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7790{
7791 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7792 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7793 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7794 extFloat80_t v;
7795 (void)fFcw;
7796
7797 v = extF80_ylog2xp1(y, x, &SoftState);
7798 iemFpuSoftF80ToIprt(pr80Result, v);
7799
7800 return fFsw;
7801}
7802
7803IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7804 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7805{
7806 uint16_t const fFcw = pFpuState->FCW;
7807 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7808
7809 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7810 {
7811 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7812
7813 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7814 if (!(fFcw & X86_FCW_PM))
7815 fFsw |= X86_FSW_ES | X86_FSW_B;
7816 }
7817 else
7818 {
7819 fFsw |= X86_FSW_IE;
7820
7821 if (!(fFcw & X86_FCW_IM))
7822 {
7823 pFpuRes->r80Result = *pr80Val2;
7824 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7825 }
7826 else
7827 {
7828 pFpuRes->r80Result = g_r80Indefinite;
7829 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7830 }
7831 }
7832
7833 pFpuRes->FSW = fFsw;
7834}
7835
7836#endif /* IEM_WITHOUT_ASSEMBLY */
7837
/**
 * The "_intel" named variant of FYL2XP1.
 *
 * No vendor specific behavior is implemented here, the call is forwarded
 * unchanged to iemAImpl_fyl2xp1_r80_by_r80.
 *
 * @param   pFpuState   The FPU state, passed on as is.
 * @param   pFpuRes     Where to return the result and the status word.
 * @param   pr80Val1    The first operand.
 * @param   pr80Val2    The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                           PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7843
/**
 * The "_amd" named variant of FYL2XP1.
 *
 * No vendor specific behavior is implemented here, the call is forwarded
 * unchanged to iemAImpl_fyl2xp1_r80_by_r80.
 *
 * @param   pFpuState   The FPU state, passed on as is.
 * @param   pFpuRes     Where to return the result and the status word.
 * @param   pr80Val1    The first operand.
 * @param   pr80Val2    The second operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                         PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7849
7850
7851/*********************************************************************************************************************************
7852* MMX, SSE & AVX *
7853*********************************************************************************************************************************/
7854
/*
 * PAND / VPAND / ANDPS / VANDPS / ANDPD / VANDPD
 */
7858#ifdef IEM_WITHOUT_ASSEMBLY
7859
7860IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(uint64_t *puDst, uint64_t const *puSrc))
7861{
7862 *puDst &= *puSrc;
7863}
7864
7865
7866IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7867{
7868 puDst->au64[0] &= puSrc->au64[0];
7869 puDst->au64[1] &= puSrc->au64[1];
7870}
7871
7872#endif
7873
7874IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7875{
7876 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7877 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7878}
7879
7880
7881IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7882{
7883 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7884 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7885 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7886 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7887}
7888
7889
/*
 * PANDN / VPANDN / ANDNPS / VANDNPS / ANDNPD / VANDNPD
 */
7893#ifdef IEM_WITHOUT_ASSEMBLY
7894
7895IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(uint64_t *puDst, uint64_t const *puSrc))
7896{
7897 *puDst = ~*puDst & *puSrc;
7898}
7899
7900
7901IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7902{
7903 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7904 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7905}
7906
7907#endif
7908
7909IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7910{
7911 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7912 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7913}
7914
7915
7916IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7917{
7918 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7919 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7920 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7921 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7922}
7923
7924
/*
 * POR / VPOR / ORPS / VORPS / ORPD / VORPD
 */
7928#ifdef IEM_WITHOUT_ASSEMBLY
7929
7930IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(uint64_t *puDst, uint64_t const *puSrc))
7931{
7932 *puDst |= *puSrc;
7933}
7934
7935
7936IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7937{
7938 puDst->au64[0] |= puSrc->au64[0];
7939 puDst->au64[1] |= puSrc->au64[1];
7940}
7941
7942#endif
7943
7944IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7945{
7946 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7947 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7948}
7949
7950
7951IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7952{
7953 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7954 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7955 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7956 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7957}
7958
7959
/*
 * PXOR / VPXOR / XORPS / VXORPS / XORPD / VXORPD
 */
7963#ifdef IEM_WITHOUT_ASSEMBLY
7964
7965IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(uint64_t *puDst, uint64_t const *puSrc))
7966{
7967 *puDst ^= *puSrc;
7968}
7969
7970
7971IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7972{
7973 puDst->au64[0] ^= puSrc->au64[0];
7974 puDst->au64[1] ^= puSrc->au64[1];
7975}
7976
7977#endif
7978
7979IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7980{
7981 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7982 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7983}
7984
7985
7986IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7987{
7988 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7989 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7990 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7991 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7992}
7993
7994
7995/*
7996 * PCMPEQB / VPCMPEQB
7997 */
7998#ifdef IEM_WITHOUT_ASSEMBLY
7999
8000IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8001{
8002 RTUINT64U uSrc1 = { *puDst };
8003 RTUINT64U uSrc2 = { *puSrc };
8004 RTUINT64U uDst;
8005 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
8006 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
8007 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
8008 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
8009 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
8010 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
8011 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
8012 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
8013 *puDst = uDst.u;
8014}
8015
8016
8017IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8018{
8019 RTUINT128U uSrc1 = *puDst;
8020 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
8021 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
8022 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
8023 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
8024 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
8025 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
8026 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
8027 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
8028 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
8029 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
8030 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
8031 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
8032 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
8033 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
8034 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
8035 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
8036}
8037
8038#endif
8039
8040IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8041{
8042 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8043 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8044 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8045 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8046 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8047 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8048 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8049 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8050 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8051 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8052 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8053 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8054 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8055 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8056 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8057 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8058}
8059
8060IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8061{
8062 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8063 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8064 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8065 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8066 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8067 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8068 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8069 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8070 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8071 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8072 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8073 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8074 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8075 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8076 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8077 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8078 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
8079 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
8080 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
8081 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
8082 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
8083 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
8084 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
8085 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
8086 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
8087 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
8088 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
8089 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
8090 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
8091 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
8092 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
8093 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
8094}
8095
8096
8097/*
8098 * PCMPEQW / VPCMPEQW
8099 */
8100#ifdef IEM_WITHOUT_ASSEMBLY
8101
8102IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8103{
8104 RTUINT64U uSrc1 = { *puDst };
8105 RTUINT64U uSrc2 = { *puSrc };
8106 RTUINT64U uDst;
8107 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
8108 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
8109 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
8110 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
8111 *puDst = uDst.u;
8112}
8113
8114
8115IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8116{
8117 RTUINT128U uSrc1 = *puDst;
8118 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
8119 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
8120 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
8121 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
8122 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
8123 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
8124 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
8125 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
8126}
8127
8128#endif
8129
8130IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8131{
8132 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8133 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8134 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8135 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8136 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8137 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8138 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8139 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8140}
8141
8142IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8143{
8144 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8145 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8146 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8147 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8148 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8149 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8150 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8151 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8152 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8153 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8154 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8155 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8156 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8157 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8158 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8159 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8160}
8161
8162
8163/*
8164 * PCMPEQD / VPCMPEQD.
8165 */
8166#ifdef IEM_WITHOUT_ASSEMBLY
8167
8168IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(uint64_t *puDst, uint64_t const *puSrc))
8169{
8170 RTUINT64U uSrc1 = { *puDst };
8171 RTUINT64U uSrc2 = { *puSrc };
8172 RTUINT64U uDst;
8173 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8174 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8175 *puDst = uDst.u;
8176}
8177
8178
8179IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8180{
8181 RTUINT128U uSrc1 = *puDst;
8182 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8183 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8184 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8185 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8186}
8187
8188#endif /* IEM_WITHOUT_ASSEMBLY */
8189
8190IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8191{
8192 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8193 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8194 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8195 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8196}
8197
8198IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8199{
8200 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8201 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8202 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8203 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8204 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8205 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8206 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8207 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8208}
8209
8210
8211/*
8212 * PCMPEQQ / VPCMPEQQ.
8213 */
8214IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8215{
8216 RTUINT128U uSrc1 = *puDst;
8217 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8218 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8219}
8220
8221IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8222{
8223 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8224 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8225}
8226
8227IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8228{
8229 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8230 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8231 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8232 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8233}
8234
8235
8236/*
8237 * PCMPGTB / VPCMPGTB
8238 */
8239#ifdef IEM_WITHOUT_ASSEMBLY
8240
8241IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8242{
8243 RTUINT64U uSrc1 = { *puDst };
8244 RTUINT64U uSrc2 = { *puSrc };
8245 RTUINT64U uDst;
8246 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8247 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8248 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8249 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8250 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8251 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8252 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8253 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8254 *puDst = uDst.u;
8255}
8256
8257
8258IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8259{
8260 RTUINT128U uSrc1 = *puDst;
8261 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8262 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8263 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8264 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8265 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8266 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8267 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8268 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8269 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8270 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8271 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8272 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8273 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8274 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8275 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8276 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8277}
8278
8279#endif
8280
8281IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8282{
8283 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8284 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8285 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8286 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8287 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8288 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8289 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8290 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8291 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8292 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8293 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8294 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8295 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8296 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8297 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8298 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8299}
8300
8301IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8302{
8303 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8304 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8305 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8306 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8307 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8308 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8309 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8310 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8311 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8312 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8313 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8314 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8315 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8316 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8317 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8318 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8319 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8320 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8321 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8322 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8323 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8324 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8325 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8326 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8327 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8328 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8329 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8330 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8331 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8332 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8333 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8334 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8335}
8336
8337
8338/*
8339 * PCMPGTW / VPCMPGTW
8340 */
8341#ifdef IEM_WITHOUT_ASSEMBLY
8342
8343IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8344{
8345 RTUINT64U uSrc1 = { *puDst };
8346 RTUINT64U uSrc2 = { *puSrc };
8347 RTUINT64U uDst;
8348 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8349 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8350 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8351 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8352 *puDst = uDst.u;
8353}
8354
8355
8356IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8357{
8358 RTUINT128U uSrc1 = *puDst;
8359 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8360 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8361 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8362 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8363 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8364 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8365 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8366 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8367}
8368
8369#endif
8370
8371IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8372{
8373 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8374 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8375 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8376 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8377 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8378 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8379 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8380 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8381}
8382
8383IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8384{
8385 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8386 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8387 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8388 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8389 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8390 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8391 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8392 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8393 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8394 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8395 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8396 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8397 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8398 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8399 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8400 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8401}
8402
8403
8404/*
8405 * PCMPGTD / VPCMPGTD.
8406 */
8407#ifdef IEM_WITHOUT_ASSEMBLY
8408
8409IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(uint64_t *puDst, uint64_t const *puSrc))
8410{
8411 RTUINT64U uSrc1 = { *puDst };
8412 RTUINT64U uSrc2 = { *puSrc };
8413 RTUINT64U uDst;
8414 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8415 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8416 *puDst = uDst.u;
8417}
8418
8419
8420IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8421{
8422 RTUINT128U uSrc1 = *puDst;
8423 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8424 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8425 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8426 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8427}
8428
8429#endif /* IEM_WITHOUT_ASSEMBLY */
8430
8431IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8432{
8433 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8434 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8435 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8436 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8437}
8438
8439IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8440{
8441 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8442 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8443 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8444 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8445 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8446 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8447 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8448 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8449}
8450
8451
8452/*
8453 * PCMPGTQ / VPCMPGTQ.
8454 */
8455IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8456{
8457 RTUINT128U uSrc1 = *puDst;
8458 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8459 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8460}
8461
8462IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8463{
8464 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8465 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8466}
8467
8468IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8469{
8470 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8471 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8472 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8473 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8474}
8475
8476
8477/*
8478 * PADDB / VPADDB
8479 */
8480#ifdef IEM_WITHOUT_ASSEMBLY
8481
8482IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8483{
8484 RTUINT64U uSrc1 = { *puDst };
8485 RTUINT64U uSrc2 = { *puSrc };
8486 RTUINT64U uDst;
8487 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8488 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8489 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8490 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8491 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8492 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8493 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8494 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8495 *puDst = uDst.u;
8496}
8497
8498
8499IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8500{
8501 RTUINT128U uSrc1 = *puDst;
8502 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8503 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8504 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8505 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8506 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8507 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8508 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8509 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8510 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8511 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8512 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8513 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8514 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8515 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8516 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8517 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8518}
8519
8520#endif
8521
8522
8523IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8524{
8525 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8526 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8527 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8528 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8529 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8530 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8531 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8532 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8533 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8534 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8535 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8536 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8537 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8538 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8539 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8540 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8541}
8542
8543IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8544{
8545 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8546 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8547 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8548 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8549 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8550 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8551 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8552 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8553 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8554 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8555 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8556 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8557 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8558 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8559 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8560 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8561 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8562 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8563 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8564 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8565 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8566 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8567 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8568 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8569 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8570 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8571 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8572 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8573 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8574 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8575 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8576 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8577}
8578
8579
8580/*
8581 * PADDSB / VPADDSB
8582 */
/**
 * Clamps a signed word value to the signed byte range and yields its byte
 * encoding.
 *
 * Biasing the value by 0x80 maps the in-range values onto 0..0xff, so one
 * unsigned compare detects both overflow directions.  Out-of-range values
 * yield 0x7f (INT8_MAX) plus source bit 15 (the sign), i.e. 0x80 (INT8_MIN)
 * for negative inputs.
 *
 * @note    The argument is evaluated more than once, so it must not have
 *          side effects.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) \
      : (uint8_t)(a_iWord) )
8587
8588#ifdef IEM_WITHOUT_ASSEMBLY
8589
8590IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8591{
8592 RTUINT64U uSrc1 = { *puDst };
8593 RTUINT64U uSrc2 = { *puSrc };
8594 RTUINT64U uDst;
8595 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8596 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8597 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8598 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8599 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8600 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8601 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8602 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8603 *puDst = uDst.u;
8604}
8605
8606
8607IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8608{
8609 RTUINT128U uSrc1 = *puDst;
8610 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8611 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8612 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8613 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8614 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8615 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8616 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8617 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8618 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8619 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8620 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8621 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8622 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8623 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8624 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8625 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8626}
8627
8628#endif
8629
8630IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u128_fallback,(PRTUINT128U puDst,
8631 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8632{
8633 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8634 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8635 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8636 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8637 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8638 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8639 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8640 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8641 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8642 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8643 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8644 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8645 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8646 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8647 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8648 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8649}
8650
8651IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u256_fallback,(PRTUINT256U puDst,
8652 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8653{
8654 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8655 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8656 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8657 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8658 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8659 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8660 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8661 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8662 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8663 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8664 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8665 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8666 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8667 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8668 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8669 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8670 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] + puSrc2->ai8[16]);
8671 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] + puSrc2->ai8[17]);
8672 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] + puSrc2->ai8[18]);
8673 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] + puSrc2->ai8[19]);
8674 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] + puSrc2->ai8[20]);
8675 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] + puSrc2->ai8[21]);
8676 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] + puSrc2->ai8[22]);
8677 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] + puSrc2->ai8[23]);
8678 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] + puSrc2->ai8[24]);
8679 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] + puSrc2->ai8[25]);
8680 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] + puSrc2->ai8[26]);
8681 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] + puSrc2->ai8[27]);
8682 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] + puSrc2->ai8[28]);
8683 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] + puSrc2->ai8[29]);
8684 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] + puSrc2->ai8[30]);
8685 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] + puSrc2->ai8[31]);
8686}
8687
8688
8689/*
8690 * PADDUSB / VPADDUSB
8691 */
/**
 * Clamps an unsigned word to the unsigned byte range.
 *
 * The argument is converted to uint16_t for the range check; anything above
 * 0xff (UINT8_MAX) yields 0xff, everything else yields the value truncated to
 * uint8_t.  Note that the argument is evaluated more than once.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
8696
8697#ifdef IEM_WITHOUT_ASSEMBLY
8698
8699IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8700{
8701 RTUINT64U uSrc1 = { *puDst };
8702 RTUINT64U uSrc2 = { *puSrc };
8703 RTUINT64U uDst;
8704 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8705 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8706 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8707 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8708 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8709 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8710 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8711 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8712 *puDst = uDst.u;
8713}
8714
8715
8716IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8717{
8718 RTUINT128U uSrc1 = *puDst;
8719 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8720 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8721 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8722 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8723 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8724 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8725 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8726 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8727 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8728 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8729 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8730 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8731 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8732 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8733 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8734 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8735}
8736
8737#endif
8738
8739IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u128_fallback,(PRTUINT128U puDst,
8740 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8741{
8742 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8743 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8744 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8745 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8746 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8747 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8748 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8749 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8750 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8751 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8752 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8753 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8754 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8755 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8756 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8757 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8758}
8759
8760IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u256_fallback,(PRTUINT256U puDst,
8761 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8762{
8763 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8764 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8765 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8766 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8767 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8768 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8769 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8770 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8771 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8772 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8773 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8774 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8775 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8776 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8777 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8778 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8779 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[16] + puSrc2->au8[16]);
8780 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[17] + puSrc2->au8[17]);
8781 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[18] + puSrc2->au8[18]);
8782 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[19] + puSrc2->au8[19]);
8783 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[20] + puSrc2->au8[20]);
8784 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[21] + puSrc2->au8[21]);
8785 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[22] + puSrc2->au8[22]);
8786 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[23] + puSrc2->au8[23]);
8787 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[24] + puSrc2->au8[24]);
8788 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[25] + puSrc2->au8[25]);
8789 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[26] + puSrc2->au8[26]);
8790 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[27] + puSrc2->au8[27]);
8791 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[28] + puSrc2->au8[28]);
8792 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[29] + puSrc2->au8[29]);
8793 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[30] + puSrc2->au8[30]);
8794 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[31] + puSrc2->au8[31]);
8795}
8796
8797
8798/*
8799 * PADDW / VPADDW
8800 */
8801#ifdef IEM_WITHOUT_ASSEMBLY
8802
8803IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8804{
8805 RTUINT64U uSrc1 = { *puDst };
8806 RTUINT64U uSrc2 = { *puSrc };
8807 RTUINT64U uDst;
8808 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8809 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8810 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8811 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8812 *puDst = uDst.u;
8813}
8814
8815
8816IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8817{
8818 RTUINT128U uSrc1 = *puDst;
8819 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8820 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8821 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8822 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8823 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8824 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8825 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8826 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8827}
8828
8829#endif
8830
8831
8832IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8833{
8834 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8835 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8836 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8837 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8838 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8839 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8840 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8841 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8842}
8843
8844IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8845{
8846 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8847 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8848 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8849 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8850 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8851 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8852 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8853 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8854 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8855 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8856 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8857 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8858 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8859 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8860 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8861 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8862}
8863
8864
8865/*
8866 * PADDSW / VPADDSW
8867 */
/**
 * Clamps a signed dword to the signed word range.
 *
 * Values in the range -0x8000..0x7fff are returned truncated to uint16_t.
 * Anything outside yields 0x7fff (INT16_MAX) plus the source sign bit
 * (bit 31), i.e. 0x7fff for positive and 0x8000 (INT16_MIN) for negative
 * input.  Note that the argument is evaluated more than once.
 */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) )
8872
8873#ifdef IEM_WITHOUT_ASSEMBLY
8874
8875IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8876{
8877 RTUINT64U uSrc1 = { *puDst };
8878 RTUINT64U uSrc2 = { *puSrc };
8879 RTUINT64U uDst;
8880 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8881 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8882 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8883 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8884 *puDst = uDst.u;
8885}
8886
8887
8888IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8889{
8890 RTUINT128U uSrc1 = *puDst;
8891 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8892 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8893 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8894 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8895 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8896 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8897 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8898 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8899}
8900
8901#endif
8902
8903IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u128_fallback,(PRTUINT128U puDst,
8904 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8905{
8906 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8907 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8908 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8909 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8910 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8911 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8912 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8913 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8914}
8915
8916IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u256_fallback,(PRTUINT256U puDst,
8917 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8918{
8919 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8920 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8921 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8922 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8923 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8924 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8925 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8926 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8927 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] + puSrc2->ai16[8]);
8928 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] + puSrc2->ai16[9]);
8929 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc2->ai16[10]);
8930 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] + puSrc2->ai16[11]);
8931 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc2->ai16[12]);
8932 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] + puSrc2->ai16[13]);
8933 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc2->ai16[14]);
8934 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] + puSrc2->ai16[15]);
8935}
8936
8937
8938/*
8939 * PADDUSW / VPADDUSW
8940 */
/**
 * Clamps an unsigned dword to the unsigned word range.
 *
 * The argument is converted to uint32_t for the range check; anything above
 * 0xffff (UINT16_MAX) yields 0xffff, everything else yields the value
 * truncated to uint16_t.  Note that the argument is evaluated more than once.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
8945
8946#ifdef IEM_WITHOUT_ASSEMBLY
8947
8948IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8949{
8950 RTUINT64U uSrc1 = { *puDst };
8951 RTUINT64U uSrc2 = { *puSrc };
8952 RTUINT64U uDst;
8953 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8954 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8955 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8956 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8957 *puDst = uDst.u;
8958}
8959
8960
8961IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8962{
8963 RTUINT128U uSrc1 = *puDst;
8964 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8965 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8966 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8967 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8968 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8969 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8970 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8971 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8972}
8973
8974#endif
8975
8976IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u128_fallback,(PRTUINT128U puDst,
8977 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8978{
8979 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
8980 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
8981 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
8982 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
8983 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
8984 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
8985 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
8986 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
8987}
8988
8989IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u256_fallback,(PRTUINT256U puDst,
8990 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8991{
8992 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
8993 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
8994 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
8995 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
8996 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
8997 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
8998 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
8999 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
9000 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[8] + puSrc2->au16[8]);
9001 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[9] + puSrc2->au16[9]);
9002 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[10] + puSrc2->au16[10]);
9003 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[11] + puSrc2->au16[11]);
9004 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[12] + puSrc2->au16[12]);
9005 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[13] + puSrc2->au16[13]);
9006 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[14] + puSrc2->au16[14]);
9007 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[15] + puSrc2->au16[15]);
9008}
9009
9010
9011/*
9012 * PADDD / VPADDD.
9013 */
9014#ifdef IEM_WITHOUT_ASSEMBLY
9015
9016IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(uint64_t *puDst, uint64_t const *puSrc))
9017{
9018 RTUINT64U uSrc1 = { *puDst };
9019 RTUINT64U uSrc2 = { *puSrc };
9020 RTUINT64U uDst;
9021 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
9022 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
9023 *puDst = uDst.u;
9024}
9025
9026
9027IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9028{
9029 RTUINT128U uSrc1 = *puDst;
9030 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
9031 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
9032 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
9033 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
9034}
9035
9036#endif /* IEM_WITHOUT_ASSEMBLY */
9037
9038IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9039{
9040 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9041 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9042 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9043 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9044}
9045
9046IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9047{
9048 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9049 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9050 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9051 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9052 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
9053 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
9054 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
9055 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
9056}
9057
9058
9059/*
9060 * PADDQ / VPADDQ.
9061 */
9062#ifdef IEM_WITHOUT_ASSEMBLY
9063
9064IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9065{
9066 *puDst = *puDst + *puSrc;
9067}
9068
9069IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9070{
9071 RTUINT128U uSrc1 = *puDst;
9072 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
9073 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
9074}
9075
9076#endif
9077
9078IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9079{
9080 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9081 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9082}
9083
9084IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9085{
9086 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9087 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9088 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
9089 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
9090}
9091
9092
9093/*
9094 * PSUBB / VPSUBB
9095 */
9096#ifdef IEM_WITHOUT_ASSEMBLY
9097
9098IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9099{
9100 RTUINT64U uSrc1 = { *puDst };
9101 RTUINT64U uSrc2 = { *puSrc };
9102 RTUINT64U uDst;
9103 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
9104 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
9105 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
9106 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
9107 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
9108 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
9109 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
9110 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
9111 *puDst = uDst.u;
9112}
9113
9114
9115IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9116{
9117 RTUINT128U uSrc1 = *puDst;
9118 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
9119 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
9120 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
9121 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
9122 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
9123 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
9124 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
9125 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
9126 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
9127 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
9128 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
9129 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
9130 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
9131 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
9132 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
9133 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
9134}
9135
9136#endif
9137
9138IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9139{
9140 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9141 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9142 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9143 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9144 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9145 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9146 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9147 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9148 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9149 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9150 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9151 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9152 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9153 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9154 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9155 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9156}
9157
9158IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9159{
9160 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9161 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9162 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9163 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9164 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9165 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9166 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9167 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9168 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9169 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9170 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9171 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9172 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9173 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9174 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9175 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9176 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
9177 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
9178 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
9179 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
9180 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
9181 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
9182 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
9183 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
9184 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
9185 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
9186 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
9187 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
9188 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
9189 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
9190 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
9191 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
9192}
9193
9194
9195/*
 * PSUBSB / VPSUBSB
9197 */
9198#ifdef IEM_WITHOUT_ASSEMBLY
9199
9200IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9201{
9202 RTUINT64U uSrc1 = { *puDst };
9203 RTUINT64U uSrc2 = { *puSrc };
9204 RTUINT64U uDst;
9205 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
9206 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
9207 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
9208 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
9209 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
9210 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
9211 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
9212 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
9213 *puDst = uDst.u;
9214}
9215
9216
9217IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9218{
9219 RTUINT128U uSrc1 = *puDst;
9220 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
9221 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
9222 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
9223 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
9224 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
9225 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
9226 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
9227 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
9228 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
9229 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
9230 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
9231 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
9232 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
9233 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
9234 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
9235 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
9236}
9237
9238#endif
9239
9240IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u128_fallback,(PRTUINT128U puDst,
9241 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9242{
9243 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9244 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9245 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9246 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9247 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9248 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9249 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9250 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9251 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9252 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9253 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9254 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9255 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9256 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9257 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9258 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9259}
9260
9261IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u256_fallback,(PRTUINT256U puDst,
9262 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9263{
9264 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9265 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9266 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9267 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9268 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9269 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9270 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9271 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9272 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9273 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9274 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9275 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9276 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9277 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9278 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9279 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9280 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] - puSrc2->ai8[16]);
9281 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] - puSrc2->ai8[17]);
9282 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] - puSrc2->ai8[18]);
9283 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] - puSrc2->ai8[19]);
9284 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] - puSrc2->ai8[20]);
9285 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] - puSrc2->ai8[21]);
9286 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] - puSrc2->ai8[22]);
9287 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] - puSrc2->ai8[23]);
9288 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] - puSrc2->ai8[24]);
9289 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] - puSrc2->ai8[25]);
9290 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] - puSrc2->ai8[26]);
9291 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] - puSrc2->ai8[27]);
9292 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] - puSrc2->ai8[28]);
9293 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] - puSrc2->ai8[29]);
9294 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] - puSrc2->ai8[30]);
9295 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] - puSrc2->ai8[31]);
9296}
9297
9298
/*
 * PSUBUSB / VPSUBUSB
 */
/** Narrows the result of an unsigned byte subtraction to uint8_t.
 * The value is truncated to 16 bits; if that is above 0xff (which is what a
 * negative difference turns into) the result is zero, otherwise the value
 * itself is returned.
 * @note The argument is expanded twice and must be free of side effects. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
9306
9307#ifdef IEM_WITHOUT_ASSEMBLY
9308
9309IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9310{
9311 RTUINT64U uSrc1 = { *puDst };
9312 RTUINT64U uSrc2 = { *puSrc };
9313 RTUINT64U uDst;
9314 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9315 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9316 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9317 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9318 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9319 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9320 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9321 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9322 *puDst = uDst.u;
9323}
9324
9325
9326IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9327{
9328 RTUINT128U uSrc1 = *puDst;
9329 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9330 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9331 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9332 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9333 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9334 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9335 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9336 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9337 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9338 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9339 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9340 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9341 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9342 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9343 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9344 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9345}
9346
9347#endif
9348
9349IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u128_fallback,(PRTUINT128U puDst,
9350 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9351{
9352 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9353 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9354 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9355 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9356 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9357 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9358 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9359 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9360 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9361 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9362 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9363 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9364 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9365 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9366 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9367 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9368}
9369
9370IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u256_fallback,(PRTUINT256U puDst,
9371 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9372{
9373 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9374 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9375 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9376 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9377 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9378 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9379 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9380 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9381 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9382 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9383 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9384 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9385 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9386 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9387 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9388 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9389 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[16] - puSrc2->au8[16]);
9390 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[17] - puSrc2->au8[17]);
9391 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[18] - puSrc2->au8[18]);
9392 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[19] - puSrc2->au8[19]);
9393 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[20] - puSrc2->au8[20]);
9394 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[21] - puSrc2->au8[21]);
9395 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[22] - puSrc2->au8[22]);
9396 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[23] - puSrc2->au8[23]);
9397 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[24] - puSrc2->au8[24]);
9398 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[25] - puSrc2->au8[25]);
9399 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[26] - puSrc2->au8[26]);
9400 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[27] - puSrc2->au8[27]);
9401 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[28] - puSrc2->au8[28]);
9402 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[29] - puSrc2->au8[29]);
9403 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[30] - puSrc2->au8[30]);
9404 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[31] - puSrc2->au8[31]);
9405}
9406
9407
9408/*
9409 * PSUBW / VPSUBW
9410 */
9411#ifdef IEM_WITHOUT_ASSEMBLY
9412
9413IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9414{
9415 RTUINT64U uSrc1 = { *puDst };
9416 RTUINT64U uSrc2 = { *puSrc };
9417 RTUINT64U uDst;
9418 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9419 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9420 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9421 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9422 *puDst = uDst.u;
9423}
9424
9425
9426IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9427{
9428 RTUINT128U uSrc1 = *puDst;
9429 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9430 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9431 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9432 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9433 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9434 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9435 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9436 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9437}
9438
9439#endif
9440
9441IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9442{
9443 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9444 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9445 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9446 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9447 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9448 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9449 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9450 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9451}
9452
9453IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9454{
9455 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9456 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9457 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9458 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9459 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9460 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9461 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9462 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9463 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9464 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9465 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9466 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9467 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9468 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9469 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9470 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9471}
9472
9473
9474/*
9475 * PSUBSW / VPSUBSW
9476 */
9477#ifdef IEM_WITHOUT_ASSEMBLY
9478
9479IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9480{
9481 RTUINT64U uSrc1 = { *puDst };
9482 RTUINT64U uSrc2 = { *puSrc };
9483 RTUINT64U uDst;
9484 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9485 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9486 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9487 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9488 *puDst = uDst.u;
9489}
9490
9491
9492IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9493{
9494 RTUINT128U uSrc1 = *puDst;
9495 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9496 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9497 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9498 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9499 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9500 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9501 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9502 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9503}
9504
9505#endif
9506
9507IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u128_fallback,(PRTUINT128U puDst,
9508 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9509{
9510 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9511 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9512 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9513 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9514 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9515 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9516 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9517 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9518}
9519
9520IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u256_fallback,(PRTUINT256U puDst,
9521 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9522{
9523 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9524 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9525 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9526 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9527 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9528 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9529 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9530 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9531 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] - puSrc2->ai16[8]);
9532 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] - puSrc2->ai16[9]);
9533 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc2->ai16[10]);
9534 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] - puSrc2->ai16[11]);
9535 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc2->ai16[12]);
9536 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] - puSrc2->ai16[13]);
9537 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc2->ai16[14]);
9538 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] - puSrc2->ai16[15]);
9539}
9540
9541
9542/*
9543 * PSUBUSW / VPSUBUSW
9544 */
/** Narrows the result of an unsigned 16-bit subtraction to uint16_t.
 * The value is converted to uint32_t; if that is above 0xffff (which is what
 * a negative difference turns into) the result is zero, otherwise the value
 * itself is returned.
 * @note The argument is expanded twice and must be free of side effects. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9549
9550#ifdef IEM_WITHOUT_ASSEMBLY
9551
9552IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9553{
9554 RTUINT64U uSrc1 = { *puDst };
9555 RTUINT64U uSrc2 = { *puSrc };
9556 RTUINT64U uDst;
9557 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9558 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9559 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9560 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9561 *puDst = uDst.u;
9562}
9563
9564
9565IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9566{
9567 RTUINT128U uSrc1 = *puDst;
9568 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9569 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9570 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9571 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9572 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9573 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9574 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9575 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9576}
9577
9578#endif
9579
9580IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u128_fallback,(PRTUINT128U puDst,
9581 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9582{
9583 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9584 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9585 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9586 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9587 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9588 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9589 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9590 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9591}
9592
9593IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u256_fallback,(PRTUINT256U puDst,
9594 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9595{
9596 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9597 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9598 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9599 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9600 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9601 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9602 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9603 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9604 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[8] - puSrc2->au16[8]);
9605 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[9] - puSrc2->au16[9]);
9606 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[10] - puSrc2->au16[10]);
9607 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[11] - puSrc2->au16[11]);
9608 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[12] - puSrc2->au16[12]);
9609 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[13] - puSrc2->au16[13]);
9610 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[14] - puSrc2->au16[14]);
9611 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[15] - puSrc2->au16[15]);
9612}
9613
9614
9615
9616/*
9617 * PSUBD / VPSUBD.
9618 */
9619#ifdef IEM_WITHOUT_ASSEMBLY
9620
9621IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(uint64_t *puDst, uint64_t const *puSrc))
9622{
9623 RTUINT64U uSrc1 = { *puDst };
9624 RTUINT64U uSrc2 = { *puSrc };
9625 RTUINT64U uDst;
9626 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9627 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9628 *puDst = uDst.u;
9629}
9630
9631
9632IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9633{
9634 RTUINT128U uSrc1 = *puDst;
9635 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9636 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9637 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9638 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9639}
9640
9641#endif /* IEM_WITHOUT_ASSEMBLY */
9642
9643IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9644{
9645 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9646 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9647 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9648 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9649}
9650
9651IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9652{
9653 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9654 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9655 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9656 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9657 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9658 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9659 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9660 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9661}
9662
9663
9664/*
9665 * PSUBQ / VPSUBQ.
9666 */
9667#ifdef IEM_WITHOUT_ASSEMBLY
9668
9669IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9670{
9671 *puDst = *puDst - *puSrc;
9672}
9673
9674IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9675{
9676 RTUINT128U uSrc1 = *puDst;
9677 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9678 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9679}
9680
9681#endif
9682
9683IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9684{
9685 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9686 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9687}
9688
9689IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9690{
9691 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9692 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9693 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9694 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9695}
9696
9697
9698
9699/*
9700 * PMULLW / VPMULLW / PMULLD / VPMULLD
9701 */
9702#ifdef IEM_WITHOUT_ASSEMBLY
9703
9704IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9705{
9706 RTUINT64U uSrc1 = { *puDst };
9707 RTUINT64U uSrc2 = { *puSrc };
9708 RTUINT64U uDst;
9709 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9710 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9711 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9712 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9713 *puDst = uDst.u;
9714}
9715
9716
9717IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9718{
9719 RTUINT128U uSrc1 = *puDst;
9720 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9721 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9722 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9723 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9724 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9725 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9726 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9727 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9728}
9729
9730#endif
9731
9732IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9733{
9734 RTUINT128U uSrc1 = *puDst;
9735
9736 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9737 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9738 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9739 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9740}
9741
9742
9743IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9744{
9745 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9746 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9747 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9748 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9749 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9750 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9751 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9752 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9753}
9754
9755
9756IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9757{
9758 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9759 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9760 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9761 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9762 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9763 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9764 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9765 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9766 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9767 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9768 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9769 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9770 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9771 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9772 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9773 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9774}
9775
9776
9777IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9778{
9779 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9780 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9781 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9782 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9783}
9784
9785
9786IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9787{
9788 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9789 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9790 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9791 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9792 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9793 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9794 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9795 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9796}
9797
9798
9799/*
9800 * PMULHW / VPMULHW
9801 */
9802#ifdef IEM_WITHOUT_ASSEMBLY
9803
9804IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9805{
9806 RTUINT64U uSrc1 = { *puDst };
9807 RTUINT64U uSrc2 = { *puSrc };
9808 RTUINT64U uDst;
9809 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9810 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9811 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9812 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9813 *puDst = uDst.u;
9814}
9815
9816
9817IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9818{
9819 RTUINT128U uSrc1 = *puDst;
9820 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9821 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9822 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9823 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9824 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9825 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9826 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9827 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9828}
9829
9830#endif
9831
9832IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9833{
9834 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9835 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9836 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9837 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9838 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9839 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9840 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9841 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9842}
9843
9844
9845IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9846{
9847 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9848 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9849 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9850 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9851 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9852 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9853 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9854 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9855 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9856 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9857 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9858 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9859 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9860 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9861 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9862 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9863}
9864
9865
9866/*
9867 * PMULHUW / VPMULHUW
9868 */
9869#ifdef IEM_WITHOUT_ASSEMBLY
9870
9871IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9872{
9873 RTUINT64U uSrc1 = { *puDst };
9874 RTUINT64U uSrc2 = { *puSrc };
9875 RTUINT64U uDst;
9876 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9877 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9878 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9879 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9880 *puDst = uDst.u;
9881}
9882
9883
9884IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9885{
9886 RTUINT128U uSrc1 = *puDst;
9887 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9888 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9889 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9890 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9891 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9892 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9893 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9894 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9895}
9896
9897#endif
9898
9899IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9900{
9901 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9902 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9903 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9904 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9905 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9906 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9907 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9908 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9909}
9910
9911
9912IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9913{
9914 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9915 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9916 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9917 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9918 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9919 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9920 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9921 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9922 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9923 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9924 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9925 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9926 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9927 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9928 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9929 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9930}
9931
9932
9933/*
9934 * PSRLW / VPSRLW
9935 */
9936#ifdef IEM_WITHOUT_ASSEMBLY
9937
9938IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9939{
9940 RTUINT64U uSrc1 = { *puDst };
9941 RTUINT64U uSrc2 = { *puSrc };
9942 RTUINT64U uDst;
9943
9944 if (uSrc2.au64[0] <= 15)
9945 {
9946 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9947 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9948 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9949 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9950 }
9951 else
9952 {
9953 uDst.au64[0] = 0;
9954 }
9955 *puDst = uDst.u;
9956}
9957
9958
9959IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9960{
9961 RTUINT64U uSrc1 = { *puDst };
9962 RTUINT64U uDst;
9963
9964 if (uShift <= 15)
9965 {
9966 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9967 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9968 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9969 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9970 }
9971 else
9972 {
9973 uDst.au64[0] = 0;
9974 }
9975 *puDst = uDst.u;
9976}
9977
9978
9979IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9980{
9981 RTUINT128U uSrc1 = *puDst;
9982
9983 if (puSrc->au64[0] <= 15)
9984 {
9985 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9986 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9987 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9988 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9989 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9990 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9991 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9992 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9993 }
9994 else
9995 {
9996 puDst->au64[0] = 0;
9997 puDst->au64[1] = 0;
9998 }
9999}
10000
10001IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10002{
10003 RTUINT128U uSrc1 = *puDst;
10004
10005 if (uShift <= 15)
10006 {
10007 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10008 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10009 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10010 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10011 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10012 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10013 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10014 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10015 }
10016 else
10017 {
10018 puDst->au64[0] = 0;
10019 puDst->au64[1] = 0;
10020 }
10021}
10022
10023#endif
10024
10025IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10026{
10027 RTUINT128U uSrc1 = *puSrc1;
10028
10029 if (uShift <= 15)
10030 {
10031 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10032 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10033 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10034 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10035 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10036 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10037 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10038 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10039 }
10040 else
10041 {
10042 puDst->au64[0] = 0;
10043 puDst->au64[1] = 0;
10044 }
10045}
10046
10047IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10048{
10049 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10050}
10051
10052IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10053{
10054 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, uShift);
10055}
10056
10057IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10058{
10059 RTUINT256U uSrc1 = *puSrc1;
10060
10061 if (uShift <= 15)
10062 {
10063 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10064 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10065 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10066 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10067 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10068 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10069 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10070 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10071 puDst->au16[8] = uSrc1.au16[8] >> uShift;
10072 puDst->au16[9] = uSrc1.au16[9] >> uShift;
10073 puDst->au16[10] = uSrc1.au16[10] >> uShift;
10074 puDst->au16[11] = uSrc1.au16[11] >> uShift;
10075 puDst->au16[12] = uSrc1.au16[12] >> uShift;
10076 puDst->au16[13] = uSrc1.au16[13] >> uShift;
10077 puDst->au16[14] = uSrc1.au16[14] >> uShift;
10078 puDst->au16[15] = uSrc1.au16[15] >> uShift;
10079 }
10080 else
10081 {
10082 puDst->au64[0] = 0;
10083 puDst->au64[1] = 0;
10084 puDst->au64[2] = 0;
10085 puDst->au64[3] = 0;
10086 }
10087}
10088
10089IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10090{
10091 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, uShift);
10092}
10093
10094IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10095{
10096 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10097}
10098
10099
10100/*
10101 * PSRAW / VPSRAW
10102 */
10103#ifdef IEM_WITHOUT_ASSEMBLY
10104
10105IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10106{
10107 RTUINT64U uSrc1 = { *puDst };
10108 RTUINT64U uSrc2 = { *puSrc };
10109 RTUINT64U uDst;
10110 uint8_t uShift;
10111
10112 uShift = RT_MIN(15, uSrc2.au64[0]);
10113
10114 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10115 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10116 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10117 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10118
10119 *puDst = uDst.u;
10120}
10121
10122
10123IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10124{
10125 RTUINT64U uSrc1 = { *puDst };
10126 RTUINT64U uDst;
10127
10128 uShift = RT_MIN(15, uShift);
10129
10130 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10131 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10132 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10133 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10134
10135 *puDst = uDst.u;
10136}
10137
10138
10139IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10140{
10141 RTUINT128U uSrc1 = *puDst;
10142 uint8_t uShift;
10143
10144 uShift = RT_MIN(15, puSrc->au64[0]);
10145
10146 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10147 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10148 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10149 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10150 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10151 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10152 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10153 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10154}
10155
10156IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10157{
10158 RTUINT128U uSrc1 = *puDst;
10159
10160 uShift = RT_MIN(15, uShift);
10161
10162 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10163 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10164 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10165 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10166 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10167 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10168 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10169 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10170}
10171
10172#endif
10173
10174IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10175{
10176 RTUINT128U uSrc1 = *puSrc1;
10177
10178 uShift = RT_MIN(15, uShift);
10179
10180 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10181 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10182 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10183 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10184 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10185 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10186 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10187 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10188}
10189
10190IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10191{
10192 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10193}
10194
10195IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10196{
10197 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, uShift);
10198}
10199
10200IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10201{
10202 RTUINT256U uSrc1 = *puSrc1;
10203
10204 uShift = RT_MIN(15, uShift);
10205
10206 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10207 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10208 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10209 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10210 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10211 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10212 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10213 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10214 puDst->ai16[8] = uSrc1.ai16[8] >> uShift;
10215 puDst->ai16[9] = uSrc1.ai16[9] >> uShift;
10216 puDst->ai16[10] = uSrc1.ai16[10] >> uShift;
10217 puDst->ai16[11] = uSrc1.ai16[11] >> uShift;
10218 puDst->ai16[12] = uSrc1.ai16[12] >> uShift;
10219 puDst->ai16[13] = uSrc1.ai16[13] >> uShift;
10220 puDst->ai16[14] = uSrc1.ai16[14] >> uShift;
10221 puDst->ai16[15] = uSrc1.ai16[15] >> uShift;
10222}
10223
10224IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10225{
10226 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, uShift);
10227}
10228
10229IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10230{
10231 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10232}
10233
10234
10235/*
10236 * PSLLW / VPSLLW
10237 */
10238#ifdef IEM_WITHOUT_ASSEMBLY
10239
10240IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10241{
10242 RTUINT64U uSrc1 = { *puDst };
10243 RTUINT64U uSrc2 = { *puSrc };
10244 RTUINT64U uDst;
10245
10246 if (uSrc2.au64[0] <= 15)
10247 {
10248 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
10249 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
10250 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
10251 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
10252 }
10253 else
10254 {
10255 uDst.au64[0] = 0;
10256 }
10257 *puDst = uDst.u;
10258}
10259
10260
10261IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10262{
10263 RTUINT64U uSrc1 = { *puDst };
10264 RTUINT64U uDst;
10265
10266 if (uShift <= 15)
10267 {
10268 uDst.au16[0] = uSrc1.au16[0] << uShift;
10269 uDst.au16[1] = uSrc1.au16[1] << uShift;
10270 uDst.au16[2] = uSrc1.au16[2] << uShift;
10271 uDst.au16[3] = uSrc1.au16[3] << uShift;
10272 }
10273 else
10274 {
10275 uDst.au64[0] = 0;
10276 }
10277 *puDst = uDst.u;
10278}
10279
10280
10281IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10282{
10283 RTUINT128U uSrc1 = *puDst;
10284
10285 if (puSrc->au64[0] <= 15)
10286 {
10287 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
10288 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
10289 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
10290 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
10291 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
10292 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
10293 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
10294 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
10295 }
10296 else
10297 {
10298 puDst->au64[0] = 0;
10299 puDst->au64[1] = 0;
10300 }
10301}
10302
10303IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10304{
10305 RTUINT128U uSrc1 = *puDst;
10306
10307 if (uShift <= 15)
10308 {
10309 puDst->au16[0] = uSrc1.au16[0] << uShift;
10310 puDst->au16[1] = uSrc1.au16[1] << uShift;
10311 puDst->au16[2] = uSrc1.au16[2] << uShift;
10312 puDst->au16[3] = uSrc1.au16[3] << uShift;
10313 puDst->au16[4] = uSrc1.au16[4] << uShift;
10314 puDst->au16[5] = uSrc1.au16[5] << uShift;
10315 puDst->au16[6] = uSrc1.au16[6] << uShift;
10316 puDst->au16[7] = uSrc1.au16[7] << uShift;
10317 }
10318 else
10319 {
10320 puDst->au64[0] = 0;
10321 puDst->au64[1] = 0;
10322 }
10323}
10324
10325#endif
10326
10327IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10328{
10329 RTUINT128U uSrc1 = *puSrc1;
10330
10331 if (uShift <= 15)
10332 {
10333 puDst->au16[0] = uSrc1.au16[0] << uShift;
10334 puDst->au16[1] = uSrc1.au16[1] << uShift;
10335 puDst->au16[2] = uSrc1.au16[2] << uShift;
10336 puDst->au16[3] = uSrc1.au16[3] << uShift;
10337 puDst->au16[4] = uSrc1.au16[4] << uShift;
10338 puDst->au16[5] = uSrc1.au16[5] << uShift;
10339 puDst->au16[6] = uSrc1.au16[6] << uShift;
10340 puDst->au16[7] = uSrc1.au16[7] << uShift;
10341 }
10342 else
10343 {
10344 puDst->au64[0] = 0;
10345 puDst->au64[1] = 0;
10346 }
10347}
10348
10349IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10350{
10351 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10352}
10353
10354IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10355{
10356 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, uShift);
10357}
10358
10359IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10360{
10361 RTUINT256U uSrc1 = *puSrc1;
10362
10363 if (uShift <= 15)
10364 {
10365 puDst->au16[0] = uSrc1.au16[0] << uShift;
10366 puDst->au16[1] = uSrc1.au16[1] << uShift;
10367 puDst->au16[2] = uSrc1.au16[2] << uShift;
10368 puDst->au16[3] = uSrc1.au16[3] << uShift;
10369 puDst->au16[4] = uSrc1.au16[4] << uShift;
10370 puDst->au16[5] = uSrc1.au16[5] << uShift;
10371 puDst->au16[6] = uSrc1.au16[6] << uShift;
10372 puDst->au16[7] = uSrc1.au16[7] << uShift;
10373 puDst->au16[8] = uSrc1.au16[8] << uShift;
10374 puDst->au16[9] = uSrc1.au16[9] << uShift;
10375 puDst->au16[10] = uSrc1.au16[10] << uShift;
10376 puDst->au16[11] = uSrc1.au16[11] << uShift;
10377 puDst->au16[12] = uSrc1.au16[12] << uShift;
10378 puDst->au16[13] = uSrc1.au16[13] << uShift;
10379 puDst->au16[14] = uSrc1.au16[14] << uShift;
10380 puDst->au16[15] = uSrc1.au16[15] << uShift;
10381 }
10382 else
10383 {
10384 puDst->au64[0] = 0;
10385 puDst->au64[1] = 0;
10386 puDst->au64[2] = 0;
10387 puDst->au64[3] = 0;
10388 }
10389}
10390
10391IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10392{
10393 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10394}
10395
10396IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10397{
10398 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, uShift);
10399}
10400
10401/*
10402 * PSRLD / VPSRLD
10403 */
10404#ifdef IEM_WITHOUT_ASSEMBLY
10405
10406IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10407{
10408 RTUINT64U uSrc1 = { *puDst };
10409 RTUINT64U uSrc2 = { *puSrc };
10410 RTUINT64U uDst;
10411
10412 if (uSrc2.au64[0] <= 31)
10413 {
10414 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
10415 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
10416 }
10417 else
10418 {
10419 uDst.au64[0] = 0;
10420 }
10421 *puDst = uDst.u;
10422}
10423
10424
10425IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10426{
10427 RTUINT64U uSrc1 = { *puDst };
10428 RTUINT64U uDst;
10429
10430 if (uShift <= 31)
10431 {
10432 uDst.au32[0] = uSrc1.au32[0] >> uShift;
10433 uDst.au32[1] = uSrc1.au32[1] >> uShift;
10434 }
10435 else
10436 {
10437 uDst.au64[0] = 0;
10438 }
10439 *puDst = uDst.u;
10440}
10441
10442
10443IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10444{
10445 RTUINT128U uSrc1 = *puDst;
10446
10447 if (puSrc->au64[0] <= 31)
10448 {
10449 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
10450 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
10451 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
10452 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
10453 }
10454 else
10455 {
10456 puDst->au64[0] = 0;
10457 puDst->au64[1] = 0;
10458 }
10459}
10460
10461IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10462{
10463 RTUINT128U uSrc1 = *puDst;
10464
10465 if (uShift <= 31)
10466 {
10467 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10468 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10469 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10470 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10471 }
10472 else
10473 {
10474 puDst->au64[0] = 0;
10475 puDst->au64[1] = 0;
10476 }
10477}
10478
10479#endif
10480
10481IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10482{
10483 RTUINT128U uSrc1 = *puSrc1;
10484
10485 if (uShift <= 31)
10486 {
10487 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10488 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10489 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10490 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10491 }
10492 else
10493 {
10494 puDst->au64[0] = 0;
10495 puDst->au64[1] = 0;
10496 }
10497}
10498
10499IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10500{
10501 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, uShift);
10502}
10503
10504IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10505{
10506 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10507}
10508
10509IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10510{
10511 RTUINT256U uSrc1 = *puSrc1;
10512
10513 if (uShift <= 31)
10514 {
10515 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10516 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10517 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10518 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10519 puDst->au32[4] = uSrc1.au32[4] >> uShift;
10520 puDst->au32[5] = uSrc1.au32[5] >> uShift;
10521 puDst->au32[6] = uSrc1.au32[6] >> uShift;
10522 puDst->au32[7] = uSrc1.au32[7] >> uShift;
10523 }
10524 else
10525 {
10526 puDst->au64[0] = 0;
10527 puDst->au64[1] = 0;
10528 puDst->au64[2] = 0;
10529 puDst->au64[3] = 0;
10530 }
10531}
10532
10533IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10534{
10535 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10536}
10537
10538IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10539{
10540 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, uShift);
10541}
10542
10543
10544/*
10545 * PSRAD / VPSRAD
10546 */
10547#ifdef IEM_WITHOUT_ASSEMBLY
10548
10549IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
10550{
10551 RTUINT64U uSrc1 = { *puDst };
10552 RTUINT64U uSrc2 = { *puSrc };
10553 RTUINT64U uDst;
10554 uint8_t uShift;
10555
10556 uShift = RT_MIN(31, uSrc2.au64[0]);
10557
10558 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10559 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10560
10561 *puDst = uDst.u;
10562}
10563
10564
10565IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
10566{
10567 RTUINT64U uSrc1 = { *puDst };
10568 RTUINT64U uDst;
10569
10570 uShift = RT_MIN(31, uShift);
10571
10572 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10573 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10574
10575 *puDst = uDst.u;
10576}
10577
10578
10579IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10580{
10581 RTUINT128U uSrc1 = *puDst;
10582 uint8_t uShift;
10583
10584 uShift = RT_MIN(31, puSrc->au64[0]);
10585
10586 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10587 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10588 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10589 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10590}
10591
10592IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10593{
10594 RTUINT128U uSrc1 = *puDst;
10595
10596 uShift = RT_MIN(31, uShift);
10597
10598 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10599 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10600 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10601 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10602}
10603
10604#endif
10605
10606IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10607{
10608 RTUINT128U uSrc1 = *puSrc1;
10609
10610 uShift = RT_MIN(31, uShift);
10611
10612 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10613 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10614 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10615 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10616}
10617
10618IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10619{
10620 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, uShift);
10621}
10622
10623IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10624{
10625 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10626}
10627
10628IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10629{
10630 RTUINT256U uSrc1 = *puSrc1;
10631
10632 uShift = RT_MIN(31, uShift);
10633
10634 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10635 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10636 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10637 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10638 puDst->ai32[4] = uSrc1.ai32[4] >> uShift;
10639 puDst->ai32[5] = uSrc1.ai32[5] >> uShift;
10640 puDst->ai32[6] = uSrc1.ai32[6] >> uShift;
10641 puDst->ai32[7] = uSrc1.ai32[7] >> uShift;
10642}
10643
10644IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10645{
10646 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10647}
10648
10649IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10650{
10651 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, uShift);
10652}
10653
10654
10655/*
10656 * PSLLD / VPSLLD
10657 */
10658#ifdef IEM_WITHOUT_ASSEMBLY
10659
10660IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10661{
10662 RTUINT64U uSrc1 = { *puDst };
10663 RTUINT64U uSrc2 = { *puSrc };
10664 RTUINT64U uDst;
10665
10666 if (uSrc2.au64[0] <= 31)
10667 {
10668 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
10669 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
10670 }
10671 else
10672 {
10673 uDst.au64[0] = 0;
10674 }
10675 *puDst = uDst.u;
10676}
10677
10678
10679IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10680{
10681 RTUINT64U uSrc1 = { *puDst };
10682 RTUINT64U uDst;
10683
10684 if (uShift <= 31)
10685 {
10686 uDst.au32[0] = uSrc1.au32[0] << uShift;
10687 uDst.au32[1] = uSrc1.au32[1] << uShift;
10688 }
10689 else
10690 {
10691 uDst.au64[0] = 0;
10692 }
10693 *puDst = uDst.u;
10694}
10695
10696
10697IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10698{
10699 RTUINT128U uSrc1 = *puDst;
10700
10701 if (puSrc->au64[0] <= 31)
10702 {
10703 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10704 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10705 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10706 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10707 }
10708 else
10709 {
10710 puDst->au64[0] = 0;
10711 puDst->au64[1] = 0;
10712 }
10713}
10714
10715IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10716{
10717 RTUINT128U uSrc1 = *puDst;
10718
10719 if (uShift <= 31)
10720 {
10721 puDst->au32[0] = uSrc1.au32[0] << uShift;
10722 puDst->au32[1] = uSrc1.au32[1] << uShift;
10723 puDst->au32[2] = uSrc1.au32[2] << uShift;
10724 puDst->au32[3] = uSrc1.au32[3] << uShift;
10725 }
10726 else
10727 {
10728 puDst->au64[0] = 0;
10729 puDst->au64[1] = 0;
10730 }
10731}
10732
10733#endif
10734
10735IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10736{
10737 RTUINT128U uSrc1 = *puSrc1;
10738
10739 if (uShift <= 31)
10740 {
10741 puDst->au32[0] = uSrc1.au32[0] << uShift;
10742 puDst->au32[1] = uSrc1.au32[1] << uShift;
10743 puDst->au32[2] = uSrc1.au32[2] << uShift;
10744 puDst->au32[3] = uSrc1.au32[3] << uShift;
10745 }
10746 else
10747 {
10748 puDst->au64[0] = 0;
10749 puDst->au64[1] = 0;
10750 }
10751}
10752
10753IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10754{
10755 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, uShift);
10756}
10757
10758IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10759{
10760 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10761}
10762
10763IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10764{
10765 RTUINT256U uSrc1 = *puSrc1;
10766
10767 if (uShift <= 31)
10768 {
10769 puDst->au32[0] = uSrc1.au32[0] << uShift;
10770 puDst->au32[1] = uSrc1.au32[1] << uShift;
10771 puDst->au32[2] = uSrc1.au32[2] << uShift;
10772 puDst->au32[3] = uSrc1.au32[3] << uShift;
10773 puDst->au32[4] = uSrc1.au32[4] << uShift;
10774 puDst->au32[5] = uSrc1.au32[5] << uShift;
10775 puDst->au32[6] = uSrc1.au32[6] << uShift;
10776 puDst->au32[7] = uSrc1.au32[7] << uShift;
10777 }
10778 else
10779 {
10780 puDst->au64[0] = 0;
10781 puDst->au64[1] = 0;
10782 puDst->au64[2] = 0;
10783 puDst->au64[3] = 0;
10784 }
10785}
10786
10787IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10788{
10789 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10790}
10791
10792IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10793{
10794 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, uShift);
10795}
10796
10797
10798/*
10799 * PSRLQ / VPSRLQ
10800 */
10801#ifdef IEM_WITHOUT_ASSEMBLY
10802
10803IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10804{
10805 RTUINT64U uSrc1 = { *puDst };
10806 RTUINT64U uSrc2 = { *puSrc };
10807 RTUINT64U uDst;
10808
10809 if (uSrc2.au64[0] <= 63)
10810 {
10811 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10812 }
10813 else
10814 {
10815 uDst.au64[0] = 0;
10816 }
10817 *puDst = uDst.u;
10818}
10819
10820
10821IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10822{
10823 RTUINT64U uSrc1 = { *puDst };
10824 RTUINT64U uDst;
10825
10826 if (uShift <= 63)
10827 {
10828 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10829 }
10830 else
10831 {
10832 uDst.au64[0] = 0;
10833 }
10834 *puDst = uDst.u;
10835}
10836
10837
10838IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10839{
10840 RTUINT128U uSrc1 = *puDst;
10841
10842 if (puSrc->au64[0] <= 63)
10843 {
10844 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10845 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10846 }
10847 else
10848 {
10849 puDst->au64[0] = 0;
10850 puDst->au64[1] = 0;
10851 }
10852}
10853
10854IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10855{
10856 RTUINT128U uSrc1 = *puDst;
10857
10858 if (uShift <= 63)
10859 {
10860 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10861 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10862 }
10863 else
10864 {
10865 puDst->au64[0] = 0;
10866 puDst->au64[1] = 0;
10867 }
10868}
10869
10870#endif
10871
10872IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10873{
10874 RTUINT128U uSrc1 = *puSrc1;
10875
10876 if (uShift <= 63)
10877 {
10878 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10879 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10880 }
10881 else
10882 {
10883 puDst->au64[0] = 0;
10884 puDst->au64[1] = 0;
10885 }
10886}
10887
10888IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10889{
10890 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, uShift);
10891}
10892
10893IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10894{
10895 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
10896}
10897
10898IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10899{
10900 RTUINT256U uSrc1 = *puSrc1;
10901
10902 if (uShift <= 63)
10903 {
10904 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10905 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10906 puDst->au64[2] = uSrc1.au64[2] >> uShift;
10907 puDst->au64[3] = uSrc1.au64[3] >> uShift;
10908 }
10909 else
10910 {
10911 puDst->au64[0] = 0;
10912 puDst->au64[1] = 0;
10913 puDst->au64[2] = 0;
10914 puDst->au64[3] = 0;
10915 }
10916}
10917
10918IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10919{
10920 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
10921}
10922
10923IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10924{
10925 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, uShift);
10926}
10927
10928
10929/*
10930 * PSLLQ / VPSLLQ
10931 */
10932#ifdef IEM_WITHOUT_ASSEMBLY
10933
10934IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10935{
10936 RTUINT64U uSrc1 = { *puDst };
10937 RTUINT64U uSrc2 = { *puSrc };
10938 RTUINT64U uDst;
10939
10940 if (uSrc2.au64[0] <= 63)
10941 {
10942 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10943 }
10944 else
10945 {
10946 uDst.au64[0] = 0;
10947 }
10948 *puDst = uDst.u;
10949}
10950
10951
10952IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10953{
10954 RTUINT64U uSrc1 = { *puDst };
10955 RTUINT64U uDst;
10956
10957 if (uShift <= 63)
10958 {
10959 uDst.au64[0] = uSrc1.au64[0] << uShift;
10960 }
10961 else
10962 {
10963 uDst.au64[0] = 0;
10964 }
10965 *puDst = uDst.u;
10966}
10967
10968
10969IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10970{
10971 RTUINT128U uSrc1 = *puDst;
10972
10973 if (puSrc->au64[0] <= 63)
10974 {
10975 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10976 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10977 }
10978 else
10979 {
10980 puDst->au64[0] = 0;
10981 puDst->au64[1] = 0;
10982 }
10983}
10984
10985IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10986{
10987 RTUINT128U uSrc1 = *puDst;
10988
10989 if (uShift <= 63)
10990 {
10991 puDst->au64[0] = uSrc1.au64[0] << uShift;
10992 puDst->au64[1] = uSrc1.au64[1] << uShift;
10993 }
10994 else
10995 {
10996 puDst->au64[0] = 0;
10997 puDst->au64[1] = 0;
10998 }
10999}
11000
11001#endif
11002
11003IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
11004{
11005 RTUINT128U uSrc1 = *puSrc1;
11006
11007 if (uShift <= 63)
11008 {
11009 puDst->au64[0] = uSrc1.au64[0] << uShift;
11010 puDst->au64[1] = uSrc1.au64[1] << uShift;
11011 }
11012 else
11013 {
11014 puDst->au64[0] = 0;
11015 puDst->au64[1] = 0;
11016 }
11017}
11018
11019IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11020{
11021 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11022}
11023
11024IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
11025{
11026 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, uShift);
11027}
11028
11029IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11030{
11031 RTUINT256U uSrc1 = *puSrc1;
11032
11033 if (uShift <= 63)
11034 {
11035 puDst->au64[0] = uSrc1.au64[0] << uShift;
11036 puDst->au64[1] = uSrc1.au64[1] << uShift;
11037 puDst->au64[2] = uSrc1.au64[2] << uShift;
11038 puDst->au64[3] = uSrc1.au64[3] << uShift;
11039 }
11040 else
11041 {
11042 puDst->au64[0] = 0;
11043 puDst->au64[1] = 0;
11044 puDst->au64[2] = 0;
11045 puDst->au64[3] = 0;
11046 }
11047}
11048
11049IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11050{
11051 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11052}
11053
11054IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11055{
11056 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, uShift);
11057}
11058
11059
11060/*
11061 * PSRLDQ / VPSRLDQ
11062 */
11063#ifdef IEM_WITHOUT_ASSEMBLY
11064
11065IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11066{
11067 if (uShift < 16)
11068 {
11069 RTUINT128U uSrc1 = *puDst;
11070 int i;
11071
11072 for (i = 0; i < 16 - uShift; ++i)
11073 puDst->au8[i] = uSrc1.au8[i + uShift];
11074 for (i = 16 - uShift; i < 16; ++i)
11075 puDst->au8[i] = 0;
11076 }
11077 else
11078 {
11079 puDst->au64[0] = 0;
11080 puDst->au64[1] = 0;
11081 }
11082}
11083
11084IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11085{
11086 if (uShift < 16)
11087 {
11088 RTUINT128U uSrc1 = *puSrc;
11089 int i;
11090
11091 for (i = 0; i < 16 - uShift; ++i)
11092 puDst->au8[i] = uSrc1.au8[i + uShift];
11093 for (i = 16 - uShift; i < 16; ++i)
11094 puDst->au8[i] = 0;
11095 }
11096 else
11097 {
11098 puDst->au64[0] = 0;
11099 puDst->au64[1] = 0;
11100 }
11101}
11102
11103IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11104{
11105 iemAImpl_vpsrldq_imm_u128(&puDst->au128[0], &puSrc->au128[0], uShift);
11106 iemAImpl_vpsrldq_imm_u128(&puDst->au128[1], &puSrc->au128[1], uShift);
11107}
11108#endif
11109
11110IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11111{
11112 if (uShift < 16)
11113 {
11114 RTUINT128U uSrc1 = *puSrc;
11115 int i;
11116
11117 for (i = 0; i < 16 - uShift; ++i)
11118 puDst->au8[i] = uSrc1.au8[i + uShift];
11119 for (i = 16 - uShift; i < 16; ++i)
11120 puDst->au8[i] = 0;
11121 }
11122 else
11123 {
11124 puDst->au64[0] = 0;
11125 puDst->au64[1] = 0;
11126 }
11127}
11128
11129IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11130{
11131 iemAImpl_vpsrldq_imm_u128_fallback(&puDst->au128[0], &puSrc->au128[0], uShift);
11132 iemAImpl_vpsrldq_imm_u128_fallback(&puDst->au128[1], &puSrc->au128[1], uShift);
11133}
11134
11135
11136/*
11137 * PSLLDQ / VPSLLDQ
11138 */
11139#ifdef IEM_WITHOUT_ASSEMBLY
11140
11141IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11142{
11143 if (uShift < 16)
11144 {
11145 RTUINT128U uSrc1 = *puDst;
11146 int i;
11147
11148 for (i = 0; i < uShift; ++i)
11149 puDst->au8[i] = 0;
11150 for (i = uShift; i < 16; ++i)
11151 puDst->au8[i] = uSrc1.au8[i - uShift];
11152 }
11153 else
11154 {
11155 puDst->au64[0] = 0;
11156 puDst->au64[1] = 0;
11157 }
11158}
11159
11160IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11161{
11162 if (uShift < 16)
11163 {
11164 RTUINT128U uSrc1 = *puSrc;
11165 int i;
11166
11167 for (i = 0; i < uShift; ++i)
11168 puDst->au8[i] = 0;
11169 for (i = uShift; i < 16; ++i)
11170 puDst->au8[i] = uSrc1.au8[i - uShift];
11171 }
11172 else
11173 {
11174 puDst->au64[0] = 0;
11175 puDst->au64[1] = 0;
11176 }
11177}
11178
11179IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11180{
11181 iemAImpl_vpslldq_imm_u128(&puDst->au128[0], &puSrc->au128[0], uShift);
11182 iemAImpl_vpslldq_imm_u128(&puDst->au128[1], &puSrc->au128[1], uShift);
11183}
11184
11185#endif
11186
11187IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11188{
11189 if (uShift < 16)
11190 {
11191 RTUINT128U uSrc1 = *puSrc;
11192 int i;
11193
11194 for (i = 0; i < uShift; ++i)
11195 puDst->au8[i] = 0;
11196 for (i = uShift; i < 16; ++i)
11197 puDst->au8[i] = uSrc1.au8[i - uShift];
11198 }
11199 else
11200 {
11201 puDst->au64[0] = 0;
11202 puDst->au64[1] = 0;
11203 }
11204}
11205
11206IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11207{
11208 iemAImpl_vpslldq_imm_u128_fallback(&puDst->au128[0], &puSrc->au128[0], uShift);
11209 iemAImpl_vpslldq_imm_u128_fallback(&puDst->au128[1], &puSrc->au128[1], uShift);
11210}
11211
11212
11213/*
11214 * VPSRLVD
11215 */
11216IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11217{
11218 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11219 {
11220 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] >> puSrc2->au8[uU32 << 2];
11221 }
11222}
11223
11224IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11225{
11226 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11227 {
11228 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] >> puSrc2->au8[uU32 << 2];
11229 }
11230}
11231
11232
11233/*
11234 * VPSRAVD
11235 */
11236IEM_DECL_IMPL_DEF(void, iemAImpl_vpsravd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11237{
11238 for (uint8_t uI32 = 0; uI32 < RT_ELEMENTS(puDst->ai32); ++uI32)
11239 {
11240 puDst->ai32[uI32] = (puSrc2->au32[uI32] > 31) ? 0 : puSrc1->ai32[uI32] >> puSrc2->au8[uI32 << 2];
11241 }
11242}
11243
11244IEM_DECL_IMPL_DEF(void, iemAImpl_vpsravd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11245{
11246 for (uint8_t uI32 = 0; uI32 < RT_ELEMENTS(puDst->ai32); ++uI32)
11247 {
11248 puDst->ai32[uI32] = (puSrc2->au32[uI32] > 31) ? 0 : puSrc1->ai32[uI32] >> puSrc2->au8[uI32 << 2];
11249 }
11250}
11251
11252
11253/*
11254 * VPSLLVD
11255 */
11256IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11257{
11258 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11259 {
11260 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] << puSrc2->au8[uU32 << 2];
11261 }
11262}
11263
11264IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11265{
11266 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11267 {
11268 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] << puSrc2->au8[uU32 << 2];
11269 }
11270}
11271
11272
11273/*
11274 * VPSRLVQ
11275 */
11276IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11277{
11278 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11279 {
11280 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] >> puSrc2->au8[uU64 << 3];
11281 }
11282}
11283
11284IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11285{
11286 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11287 {
11288 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] >> puSrc2->au8[uU64 << 3];
11289 }
11290}
11291
11292
11293/*
11294 * VPSLLVQ
11295 */
11296IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11297{
11298 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11299 {
11300 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] << puSrc2->au8[uU64 << 3];
11301 }
11302}
11303
11304IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11305{
11306 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11307 {
11308 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] << puSrc2->au8[uU64 << 3];
11309 }
11310}
11311
11312
11313/*
11314 * PMADDWD / VPMADDWD
11315 */
11316#ifdef IEM_WITHOUT_ASSEMBLY
11317
11318IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11319{
11320 RTUINT64U uSrc1 = { *puDst };
11321 RTUINT64U uSrc2 = { *puSrc };
11322 RTUINT64U uDst;
11323
11324 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11325 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11326 *puDst = uDst.u;
11327}
11328
11329
11330IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11331{
11332 RTUINT128U uSrc1 = *puDst;
11333
11334 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11335 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11336 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11337 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11338}
11339
11340#endif
11341
11342
11343IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
11344{
11345 RTUINT64U uSrc1 = { *puDst };
11346 RTUINT64U uSrc2 = { *puSrc };
11347 RTUINT64U uDst;
11348
11349 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11350 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11351 *puDst = uDst.u;
11352}
11353
11354
11355IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11356{
11357 RTUINT128U uSrc1 = *puDst;
11358
11359 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11360 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11361 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11362 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11363}
11364
11365
11366IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11367{
11368 puDst->ai32[0] = (int32_t)puSrc1->ai16[0] * puSrc2->ai16[0] + (int32_t)puSrc1->ai16[1] * puSrc2->ai16[1];
11369 puDst->ai32[1] = (int32_t)puSrc1->ai16[2] * puSrc2->ai16[2] + (int32_t)puSrc1->ai16[3] * puSrc2->ai16[3];
11370 puDst->ai32[2] = (int32_t)puSrc1->ai16[4] * puSrc2->ai16[4] + (int32_t)puSrc1->ai16[5] * puSrc2->ai16[5];
11371 puDst->ai32[3] = (int32_t)puSrc1->ai16[6] * puSrc2->ai16[6] + (int32_t)puSrc1->ai16[7] * puSrc2->ai16[7];
11372}
11373
11374
11375IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11376{
11377 puDst->ai32[0] = (int32_t)puSrc1->ai16[0] * puSrc2->ai16[0] + (int32_t)puSrc1->ai16[1] * puSrc2->ai16[1];
11378 puDst->ai32[1] = (int32_t)puSrc1->ai16[2] * puSrc2->ai16[2] + (int32_t)puSrc1->ai16[3] * puSrc2->ai16[3];
11379 puDst->ai32[2] = (int32_t)puSrc1->ai16[4] * puSrc2->ai16[4] + (int32_t)puSrc1->ai16[5] * puSrc2->ai16[5];
11380 puDst->ai32[3] = (int32_t)puSrc1->ai16[6] * puSrc2->ai16[6] + (int32_t)puSrc1->ai16[7] * puSrc2->ai16[7];
11381 puDst->ai32[4] = (int32_t)puSrc1->ai16[8] * puSrc2->ai16[8] + (int32_t)puSrc1->ai16[9] * puSrc2->ai16[9];
11382 puDst->ai32[5] = (int32_t)puSrc1->ai16[10] * puSrc2->ai16[10] + (int32_t)puSrc1->ai16[11] * puSrc2->ai16[11];
11383 puDst->ai32[6] = (int32_t)puSrc1->ai16[12] * puSrc2->ai16[12] + (int32_t)puSrc1->ai16[13] * puSrc2->ai16[13];
11384 puDst->ai32[7] = (int32_t)puSrc1->ai16[14] * puSrc2->ai16[14] + (int32_t)puSrc1->ai16[15] * puSrc2->ai16[15];
11385}
11386
11387
11388/*
11389 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
11390 */
11391#ifdef IEM_WITHOUT_ASSEMBLY
11392
11393IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(uint64_t *puDst, uint64_t const *puSrc))
11394{
11395 RTUINT64U uSrc1 = { *puDst };
11396 RTUINT64U uSrc2 = { *puSrc };
11397 RTUINT64U uDst;
11398
11399 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
11400 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
11401 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
11402 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
11403 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
11404 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
11405 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
11406 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
11407 *puDst = uDst.u;
11408}
11409
11410
11411IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11412{
11413 RTUINT128U uSrc1 = *puDst;
11414
11415 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
11416 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
11417 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
11418 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
11419 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
11420 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
11421 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
11422 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
11423 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
11424 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
11425 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
11426 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
11427 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
11428 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
11429 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
11430 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
11431}
11432
11433#endif
11434
11435
11436IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11437{
11438 RTUINT128U uSrc1 = *puDst;
11439
11440 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
11441 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
11442 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
11443 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
11444 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
11445 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
11446 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
11447 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
11448}
11449
11450
11451IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11452{
11453 RTUINT128U uSrc1 = *puDst;
11454
11455 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
11456 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
11457 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
11458 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
11459}
11460
11461
11462IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11463{
11464 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11465 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11466 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11467 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11468 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11469 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11470 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11471 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11472 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11473 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11474 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11475 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11476 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11477 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11478 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11479 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11480}
11481
11482
11483IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11484{
11485 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11486 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11487 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11488 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11489 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11490 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11491 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11492 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11493 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11494 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11495 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11496 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11497 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11498 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11499 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11500 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11501 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
11502 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
11503 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
11504 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
11505 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
11506 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
11507 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
11508 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
11509 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
11510 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
11511 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
11512 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
11513 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
11514 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
11515 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
11516 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
11517}
11518
11519
11520IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11521{
11522 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11523 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11524 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11525 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11526 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11527 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11528 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11529 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11530}
11531
11532
11533IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11534{
11535 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11536 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11537 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11538 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11539 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11540 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11541 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11542 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11543 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11544 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11545 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
11546 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
11547 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
11548 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
11549 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
11550 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
11551}
11552
11553
11554IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11555{
11556 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11557 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11558 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11559 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11560}
11561
11562
11563IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11564{
11565 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11566 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11567 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11568 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11569 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11570 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11571 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11572 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11573}
11574
11575
11576/*
11577 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
11578 */
11579#ifdef IEM_WITHOUT_ASSEMBLY
11580
11581IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11582{
11583 RTUINT64U uSrc1 = { *puDst };
11584 RTUINT64U uSrc2 = { *puSrc };
11585 RTUINT64U uDst;
11586
11587 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
11588 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
11589 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
11590 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
11591 *puDst = uDst.u;
11592}
11593
11594
11595IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11596{
11597 RTUINT128U uSrc1 = *puDst;
11598
11599 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11600 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11601 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11602 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11603 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11604 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11605 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11606 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11607}
11608
11609#endif
11610
11611IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11612{
11613 RTUINT128U uSrc1 = *puDst;
11614
11615 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11616 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11617 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11618 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11619 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11620 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11621 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11622 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11623 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11624 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11625 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
11626 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
11627 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
11628 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
11629 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
11630 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
11631}
11632
11633
11634IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11635{
11636 RTUINT128U uSrc1 = *puDst;
11637
11638 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11639 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11640 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11641 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11642}
11643
11644
11645IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11646{
11647 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11648 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11649 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11650 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11651 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11652 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11653 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11654 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11655 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11656 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11657 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11658 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11659 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11660 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11661 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11662 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11663}
11664
11665
11666IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11667{
11668 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11669 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11670 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11671 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11672 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11673 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11674 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11675 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11676 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11677 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11678 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11679 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11680 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11681 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11682 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11683 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11684 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
11685 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
11686 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
11687 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
11688 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
11689 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
11690 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
11691 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
11692 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
11693 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
11694 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
11695 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
11696 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
11697 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
11698 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
11699 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
11700}
11701
11702
11703IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11704{
11705 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11706 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11707 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11708 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11709 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11710 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11711 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11712 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11713}
11714
11715
11716IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11717{
11718 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11719 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11720 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11721 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11722 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11723 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11724 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11725 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11726 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11727 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11728 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
11729 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
11730 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
11731 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
11732 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
11733 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
11734}
11735
11736
11737IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11738{
11739 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11740 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11741 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11742 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11743}
11744
11745
11746IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11747{
11748 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11749 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11750 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11751 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11752 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11753 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11754 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11755 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11756}
11757
11758
11759/*
11760 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
11761 */
11762#ifdef IEM_WITHOUT_ASSEMBLY
11763
11764IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(uint64_t *puDst, uint64_t const *puSrc))
11765{
11766 RTUINT64U uSrc1 = { *puDst };
11767 RTUINT64U uSrc2 = { *puSrc };
11768 RTUINT64U uDst;
11769
11770 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
11771 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
11772 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
11773 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
11774 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
11775 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
11776 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
11777 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
11778 *puDst = uDst.u;
11779}
11780
11781
11782IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11783{
11784 RTUINT128U uSrc1 = *puDst;
11785
11786 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
11787 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
11788 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
11789 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
11790 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
11791 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
11792 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
11793 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
11794 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
11795 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
11796 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
11797 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
11798 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
11799 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
11800 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
11801 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
11802}
11803
11804#endif
11805
11806IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11807{
11808 RTUINT128U uSrc1 = *puDst;
11809
11810 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
11811 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
11812 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
11813 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
11814 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
11815 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
11816 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
11817 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
11818}
11819
11820
11821IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11822{
11823 RTUINT128U uSrc1 = *puDst;
11824
11825 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
11826 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
11827 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
11828 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
11829}
11830
11831
11832IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11833{
11834 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11835 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11836 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11837 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11838 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11839 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11840 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11841 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11842 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11843 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11844 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11845 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11846 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11847 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11848 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11849 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11850}
11851
11852
11853IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11854{
11855 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11856 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11857 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11858 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11859 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11860 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11861 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11862 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11863 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11864 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11865 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11866 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11867 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11868 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11869 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11870 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11871 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
11872 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
11873 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
11874 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
11875 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
11876 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
11877 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
11878 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
11879 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
11880 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
11881 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
11882 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
11883 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
11884 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
11885 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
11886 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
11887}
11888
11889
11890IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11891{
11892 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11893 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11894 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11895 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11896 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11897 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11898 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11899 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11900}
11901
11902
11903IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11904{
11905 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11906 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11907 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11908 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11909 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11910 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11911 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11912 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11913 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11914 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11915 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
11916 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
11917 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
11918 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
11919 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
11920 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
11921}
11922
11923
11924IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11925{
11926 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11927 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11928 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11929 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11930}
11931
11932
11933IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11934{
11935 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11936 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11937 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11938 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11939 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11940 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11941 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11942 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11943}
11944
11945
11946/*
11947 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
11948 */
11949#ifdef IEM_WITHOUT_ASSEMBLY
11950
11951IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11952{
11953 RTUINT64U uSrc1 = { *puDst };
11954 RTUINT64U uSrc2 = { *puSrc };
11955 RTUINT64U uDst;
11956
11957 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
11958 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
11959 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
11960 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
11961 *puDst = uDst.u;
11962}
11963
11964
11965IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11966{
11967 RTUINT128U uSrc1 = *puDst;
11968
11969 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11970 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11971 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11972 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11973 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11974 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11975 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11976 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11977}
11978
11979#endif
11980
11981IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11982{
11983 RTUINT128U uSrc1 = *puDst;
11984
11985 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11986 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11987 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11988 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11989 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11990 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11991 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11992 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11993 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11994 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11995 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
11996 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
11997 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
11998 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
11999 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
12000 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
12001}
12002
12003
12004IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12005{
12006 RTUINT128U uSrc1 = *puDst;
12007
12008 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
12009 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
12010 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
12011 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
12012}
12013
12014
12015IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12016{
12017 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
12018 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
12019 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
12020 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
12021 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
12022 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
12023 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
12024 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
12025 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
12026 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
12027 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
12028 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
12029 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
12030 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
12031 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
12032 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
12033}
12034
12035
12036IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12037{
12038 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
12039 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
12040 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
12041 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
12042 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
12043 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
12044 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
12045 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
12046 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
12047 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
12048 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
12049 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
12050 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
12051 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
12052 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
12053 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
12054 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
12055 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
12056 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
12057 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
12058 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
12059 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
12060 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
12061 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
12062 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
12063 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
12064 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
12065 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
12066 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
12067 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
12068 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
12069 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
12070}
12071
12072
12073IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12074{
12075 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12076 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12077 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12078 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12079 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12080 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12081 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12082 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12083}
12084
12085
12086IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12087{
12088 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12089 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12090 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12091 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12092 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12093 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12094 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12095 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12096 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
12097 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
12098 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
12099 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
12100 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
12101 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
12102 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
12103 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
12104}
12105
12106
12107IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12108{
12109 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12110 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12111 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12112 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12113}
12114
12115
12116IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12117{
12118 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12119 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12120 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12121 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12122 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
12123 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
12124 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
12125 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
12126}
12127
12128
12129/*
12130 * PAVGB / VPAVGB / PAVGW / VPAVGW
12131 */
/** Per-element PAVGB step: (a_Src1 + a_Src2 + 1) >> 1, truncated to uint8_t.
 * The first operand is cast to uint16_t before the addition; each argument
 * is evaluated exactly once. */
#define PAVGB_EXEC(a_Src1, a_Src2) ((uint8_t)(((uint16_t)(a_Src1) + (a_Src2) + 1) >> 1))
/** Per-element PAVGW step: (a_Src1 + a_Src2 + 1) >> 1, truncated to uint16_t.
 * The first operand is cast to uint32_t before the addition; each argument
 * is evaluated exactly once. */
#define PAVGW_EXEC(a_Src1, a_Src2) ((uint16_t)(((uint32_t)(a_Src1) + (a_Src2) + 1) >> 1))
12134
12135#ifdef IEM_WITHOUT_ASSEMBLY
12136
12137IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12138{
12139 RTUINT64U uSrc1 = { *puDst };
12140 RTUINT64U uSrc2 = { *puSrc };
12141 RTUINT64U uDst;
12142
12143 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
12144 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
12145 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
12146 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
12147 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
12148 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
12149 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
12150 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
12151 *puDst = uDst.u;
12152}
12153
12154
12155IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12156{
12157 RTUINT128U uSrc1 = *puDst;
12158
12159 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12160 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12161 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12162 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12163 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12164 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12165 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12166 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12167 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12168 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12169 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12170 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12171 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12172 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12173 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12174 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12175}
12176
12177
12178IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12179{
12180 RTUINT64U uSrc1 = { *puDst };
12181 RTUINT64U uSrc2 = { *puSrc };
12182 RTUINT64U uDst;
12183
12184 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
12185 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
12186 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
12187 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
12188 *puDst = uDst.u;
12189}
12190
12191
12192IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12193{
12194 RTUINT128U uSrc1 = *puDst;
12195
12196 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
12197 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
12198 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
12199 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
12200 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
12201 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
12202 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
12203 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
12204}
12205
12206#endif
12207
12208IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12209{
12210 RTUINT128U uSrc1 = *puDst;
12211
12212 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12213 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12214 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12215 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12216 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12217 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12218 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12219 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12220 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12221 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12222 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12223 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12224 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12225 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12226 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12227 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12228}
12229
12230
12231IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12232{
12233 RTUINT128U uSrc1 = *puDst;
12234
12235 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12236 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12237 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12238 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12239 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12240 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12241 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12242 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12243 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12244 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12245 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12246 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12247 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12248 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12249 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12250 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12251}
12252
12253
12254IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12255{
12256 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12257 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12258 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12259 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12260 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12261 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12262 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12263 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12264 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12265 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12266 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12267 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12268 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12269 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12270 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12271 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12272}
12273
12274
12275IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12276{
12277 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12278 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12279 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12280 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12281 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12282 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12283 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12284 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12285 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12286 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12287 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12288 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12289 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12290 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12291 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12292 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12293 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
12294 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
12295 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
12296 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
12297 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
12298 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
12299 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
12300 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
12301 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
12302 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
12303 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
12304 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
12305 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
12306 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
12307 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
12308 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
12309}
12310
12311
12312IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12313{
12314 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12315 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12316 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12317 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12318 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12319 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12320 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12321 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12322}
12323
12324
12325IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12326{
12327 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12328 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12329 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12330 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12331 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12332 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12333 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12334 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12335 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
12336 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
12337 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
12338 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
12339 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
12340 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
12341 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
12342 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
12343}
12344
12345#undef PAVGB_EXEC
12346#undef PAVGW_EXEC
12347
12348
12349/*
12350 * PMOVMSKB / VPMOVMSKB
12351 */
12352#ifdef IEM_WITHOUT_ASSEMBLY
12353
12354IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
12355{
12356 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12357 uint64_t const uSrc = *pu64Src;
12358 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
12359 | ((uSrc >> (15-1)) & RT_BIT_64(1))
12360 | ((uSrc >> (23-2)) & RT_BIT_64(2))
12361 | ((uSrc >> (31-3)) & RT_BIT_64(3))
12362 | ((uSrc >> (39-4)) & RT_BIT_64(4))
12363 | ((uSrc >> (47-5)) & RT_BIT_64(5))
12364 | ((uSrc >> (55-6)) & RT_BIT_64(6))
12365 | ((uSrc >> (63-7)) & RT_BIT_64(7));
12366}
12367
12368
12369IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
12370{
12371 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12372 uint64_t const uSrc0 = pu128Src->QWords.qw0;
12373 uint64_t const uSrc1 = pu128Src->QWords.qw1;
12374 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12375 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12376 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12377 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12378 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12379 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12380 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12381 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12382 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12383 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12384 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12385 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12386 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12387 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12388 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12389 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
12390}
12391
12392#endif
12393
12394IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
12395{
12396 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12397 uint64_t const uSrc0 = puSrc->QWords.qw0;
12398 uint64_t const uSrc1 = puSrc->QWords.qw1;
12399 uint64_t const uSrc2 = puSrc->QWords.qw2;
12400 uint64_t const uSrc3 = puSrc->QWords.qw3;
12401 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12402 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12403 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12404 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12405 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12406 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12407 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12408 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12409 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12410 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12411 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12412 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12413 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12414 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12415 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12416 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
12417 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
12418 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
12419 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
12420 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
12421 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
12422 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
12423 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
12424 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
12425 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
12426 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
12427 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
12428 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
12429 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
12430 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
12431 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
12432 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
12433}
12434
12435
12436/*
12437 * [V]PSHUFB
12438 */
12439
12440IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
12441{
12442 RTUINT64U const uSrc = { *puSrc };
12443 RTUINT64U const uDstIn = { *puDst };
12444 ASMCompilerBarrier();
12445 RTUINT64U uDstOut = { 0 };
12446 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
12447 {
12448 uint8_t idxSrc = uSrc.au8[iByte];
12449 if (!(idxSrc & 0x80))
12450 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
12451 }
12452 *puDst = uDstOut.u;
12453}
12454
12455
12456IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12457{
12458 RTUINT128U const uSrc = *puSrc;
12459 RTUINT128U const uDstIn = *puDst;
12460 ASMCompilerBarrier();
12461 puDst->au64[0] = 0;
12462 puDst->au64[1] = 0;
12463 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12464 {
12465 uint8_t idxSrc = uSrc.au8[iByte];
12466 if (!(idxSrc & 0x80))
12467 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
12468 }
12469}
12470
12471
12472IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12473{
12474 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
12475 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
12476 ASMCompilerBarrier();
12477 puDst->au64[0] = 0;
12478 puDst->au64[1] = 0;
12479 for (unsigned iByte = 0; iByte < 16; iByte++)
12480 {
12481 uint8_t idxSrc = uSrc2.au8[iByte];
12482 if (!(idxSrc & 0x80))
12483 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12484 }
12485}
12486
12487
12488IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12489{
12490 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
12491 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
12492 ASMCompilerBarrier();
12493 puDst->au64[0] = 0;
12494 puDst->au64[1] = 0;
12495 puDst->au64[2] = 0;
12496 puDst->au64[3] = 0;
12497 for (unsigned iByte = 0; iByte < 16; iByte++)
12498 {
12499 uint8_t idxSrc = uSrc2.au8[iByte];
12500 if (!(idxSrc & 0x80))
12501 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12502 }
12503 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12504 {
12505 uint8_t idxSrc = uSrc2.au8[iByte];
12506 if (!(idxSrc & 0x80))
12507 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
12508 }
12509}
12510
12511
12512/*
12513 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
12514 */
12515#ifdef IEM_WITHOUT_ASSEMBLY
12516
12517IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
12518{
12519 uint64_t const uSrc = *puSrc;
12520 ASMCompilerBarrier();
12521 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12522 uSrc >> (((bEvil >> 2) & 3) * 16),
12523 uSrc >> (((bEvil >> 4) & 3) * 16),
12524 uSrc >> (((bEvil >> 6) & 3) * 16));
12525}
12526
12527
12528IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12529{
12530 puDst->QWords.qw0 = puSrc->QWords.qw0;
12531 uint64_t const uSrc = puSrc->QWords.qw1;
12532 ASMCompilerBarrier();
12533 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12534 uSrc >> (((bEvil >> 2) & 3) * 16),
12535 uSrc >> (((bEvil >> 4) & 3) * 16),
12536 uSrc >> (((bEvil >> 6) & 3) * 16));
12537}
12538
12539#endif
12540
12541IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12542{
12543 puDst->QWords.qw0 = puSrc->QWords.qw0;
12544 uint64_t const uSrc1 = puSrc->QWords.qw1;
12545 puDst->QWords.qw2 = puSrc->QWords.qw2;
12546 uint64_t const uSrc3 = puSrc->QWords.qw3;
12547 ASMCompilerBarrier();
12548 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
12549 uSrc1 >> (((bEvil >> 2) & 3) * 16),
12550 uSrc1 >> (((bEvil >> 4) & 3) * 16),
12551 uSrc1 >> (((bEvil >> 6) & 3) * 16));
12552 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
12553 uSrc3 >> (((bEvil >> 2) & 3) * 16),
12554 uSrc3 >> (((bEvil >> 4) & 3) * 16),
12555 uSrc3 >> (((bEvil >> 6) & 3) * 16));
12556}
12557
12558#ifdef IEM_WITHOUT_ASSEMBLY
12559IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12560{
12561 puDst->QWords.qw1 = puSrc->QWords.qw1;
12562 uint64_t const uSrc = puSrc->QWords.qw0;
12563 ASMCompilerBarrier();
12564 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12565 uSrc >> (((bEvil >> 2) & 3) * 16),
12566 uSrc >> (((bEvil >> 4) & 3) * 16),
12567 uSrc >> (((bEvil >> 6) & 3) * 16));
12568
12569}
12570#endif
12571
12572
12573IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12574{
12575 puDst->QWords.qw3 = puSrc->QWords.qw3;
12576 uint64_t const uSrc2 = puSrc->QWords.qw2;
12577 puDst->QWords.qw1 = puSrc->QWords.qw1;
12578 uint64_t const uSrc0 = puSrc->QWords.qw0;
12579 ASMCompilerBarrier();
12580 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
12581 uSrc0 >> (((bEvil >> 2) & 3) * 16),
12582 uSrc0 >> (((bEvil >> 4) & 3) * 16),
12583 uSrc0 >> (((bEvil >> 6) & 3) * 16));
12584 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
12585 uSrc2 >> (((bEvil >> 2) & 3) * 16),
12586 uSrc2 >> (((bEvil >> 4) & 3) * 16),
12587 uSrc2 >> (((bEvil >> 6) & 3) * 16));
12588
12589}
12590
12591
12592#ifdef IEM_WITHOUT_ASSEMBLY
12593IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12594{
12595 RTUINT128U const uSrc = *puSrc;
12596 ASMCompilerBarrier();
12597 puDst->au32[0] = uSrc.au32[bEvil & 3];
12598 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
12599 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
12600 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
12601}
12602#endif
12603
12604
12605IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12606{
12607 RTUINT256U const uSrc = *puSrc;
12608 ASMCompilerBarrier();
12609 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
12610 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
12611 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
12612 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
12613 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
12614 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
12615 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
12616 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
12617}
12618
12619
12620/*
12621 * PUNPCKHBW - high bytes -> words
12622 */
12623#ifdef IEM_WITHOUT_ASSEMBLY
12624
12625IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12626{
12627 RTUINT64U const uSrc2 = { *puSrc };
12628 RTUINT64U const uSrc1 = { *puDst };
12629 ASMCompilerBarrier();
12630 RTUINT64U uDstOut;
12631 uDstOut.au8[0] = uSrc1.au8[4];
12632 uDstOut.au8[1] = uSrc2.au8[4];
12633 uDstOut.au8[2] = uSrc1.au8[5];
12634 uDstOut.au8[3] = uSrc2.au8[5];
12635 uDstOut.au8[4] = uSrc1.au8[6];
12636 uDstOut.au8[5] = uSrc2.au8[6];
12637 uDstOut.au8[6] = uSrc1.au8[7];
12638 uDstOut.au8[7] = uSrc2.au8[7];
12639 *puDst = uDstOut.u;
12640}
12641
12642
12643IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12644{
12645 RTUINT128U const uSrc2 = *puSrc;
12646 RTUINT128U const uSrc1 = *puDst;
12647 ASMCompilerBarrier();
12648 RTUINT128U uDstOut;
12649 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12650 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12651 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12652 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12653 uDstOut.au8[ 4] = uSrc1.au8[10];
12654 uDstOut.au8[ 5] = uSrc2.au8[10];
12655 uDstOut.au8[ 6] = uSrc1.au8[11];
12656 uDstOut.au8[ 7] = uSrc2.au8[11];
12657 uDstOut.au8[ 8] = uSrc1.au8[12];
12658 uDstOut.au8[ 9] = uSrc2.au8[12];
12659 uDstOut.au8[10] = uSrc1.au8[13];
12660 uDstOut.au8[11] = uSrc2.au8[13];
12661 uDstOut.au8[12] = uSrc1.au8[14];
12662 uDstOut.au8[13] = uSrc2.au8[14];
12663 uDstOut.au8[14] = uSrc1.au8[15];
12664 uDstOut.au8[15] = uSrc2.au8[15];
12665 *puDst = uDstOut;
12666}
12667
12668#endif
12669
12670IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12671{
12672 RTUINT128U const uSrc2 = *puSrc2;
12673 RTUINT128U const uSrc1 = *puSrc1;
12674 ASMCompilerBarrier();
12675 RTUINT128U uDstOut;
12676 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12677 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12678 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12679 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12680 uDstOut.au8[ 4] = uSrc1.au8[10];
12681 uDstOut.au8[ 5] = uSrc2.au8[10];
12682 uDstOut.au8[ 6] = uSrc1.au8[11];
12683 uDstOut.au8[ 7] = uSrc2.au8[11];
12684 uDstOut.au8[ 8] = uSrc1.au8[12];
12685 uDstOut.au8[ 9] = uSrc2.au8[12];
12686 uDstOut.au8[10] = uSrc1.au8[13];
12687 uDstOut.au8[11] = uSrc2.au8[13];
12688 uDstOut.au8[12] = uSrc1.au8[14];
12689 uDstOut.au8[13] = uSrc2.au8[14];
12690 uDstOut.au8[14] = uSrc1.au8[15];
12691 uDstOut.au8[15] = uSrc2.au8[15];
12692 *puDst = uDstOut;
12693}
12694
12695
12696IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12697{
12698 RTUINT256U const uSrc2 = *puSrc2;
12699 RTUINT256U const uSrc1 = *puSrc1;
12700 ASMCompilerBarrier();
12701 RTUINT256U uDstOut;
12702 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12703 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12704 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12705 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12706 uDstOut.au8[ 4] = uSrc1.au8[10];
12707 uDstOut.au8[ 5] = uSrc2.au8[10];
12708 uDstOut.au8[ 6] = uSrc1.au8[11];
12709 uDstOut.au8[ 7] = uSrc2.au8[11];
12710 uDstOut.au8[ 8] = uSrc1.au8[12];
12711 uDstOut.au8[ 9] = uSrc2.au8[12];
12712 uDstOut.au8[10] = uSrc1.au8[13];
12713 uDstOut.au8[11] = uSrc2.au8[13];
12714 uDstOut.au8[12] = uSrc1.au8[14];
12715 uDstOut.au8[13] = uSrc2.au8[14];
12716 uDstOut.au8[14] = uSrc1.au8[15];
12717 uDstOut.au8[15] = uSrc2.au8[15];
12718 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12719 uDstOut.au8[16] = uSrc1.au8[24];
12720 uDstOut.au8[17] = uSrc2.au8[24];
12721 uDstOut.au8[18] = uSrc1.au8[25];
12722 uDstOut.au8[19] = uSrc2.au8[25];
12723 uDstOut.au8[20] = uSrc1.au8[26];
12724 uDstOut.au8[21] = uSrc2.au8[26];
12725 uDstOut.au8[22] = uSrc1.au8[27];
12726 uDstOut.au8[23] = uSrc2.au8[27];
12727 uDstOut.au8[24] = uSrc1.au8[28];
12728 uDstOut.au8[25] = uSrc2.au8[28];
12729 uDstOut.au8[26] = uSrc1.au8[29];
12730 uDstOut.au8[27] = uSrc2.au8[29];
12731 uDstOut.au8[28] = uSrc1.au8[30];
12732 uDstOut.au8[29] = uSrc2.au8[30];
12733 uDstOut.au8[30] = uSrc1.au8[31];
12734 uDstOut.au8[31] = uSrc2.au8[31];
12735 *puDst = uDstOut;
12736}
12737
12738
12739/*
12740 * PUNPCKHWD - high words -> dwords
12741 */
12742#ifdef IEM_WITHOUT_ASSEMBLY
12743
12744IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12745{
12746 RTUINT64U const uSrc2 = { *puSrc };
12747 RTUINT64U const uSrc1 = { *puDst };
12748 ASMCompilerBarrier();
12749 RTUINT64U uDstOut;
12750 uDstOut.au16[0] = uSrc1.au16[2];
12751 uDstOut.au16[1] = uSrc2.au16[2];
12752 uDstOut.au16[2] = uSrc1.au16[3];
12753 uDstOut.au16[3] = uSrc2.au16[3];
12754 *puDst = uDstOut.u;
12755}
12756
12757
12758IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12759{
12760 RTUINT128U const uSrc2 = *puSrc;
12761 RTUINT128U const uSrc1 = *puDst;
12762 ASMCompilerBarrier();
12763 RTUINT128U uDstOut;
12764 uDstOut.au16[0] = uSrc1.au16[4];
12765 uDstOut.au16[1] = uSrc2.au16[4];
12766 uDstOut.au16[2] = uSrc1.au16[5];
12767 uDstOut.au16[3] = uSrc2.au16[5];
12768 uDstOut.au16[4] = uSrc1.au16[6];
12769 uDstOut.au16[5] = uSrc2.au16[6];
12770 uDstOut.au16[6] = uSrc1.au16[7];
12771 uDstOut.au16[7] = uSrc2.au16[7];
12772 *puDst = uDstOut;
12773}
12774
12775#endif
12776
12777IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12778{
12779 RTUINT128U const uSrc2 = *puSrc2;
12780 RTUINT128U const uSrc1 = *puSrc1;
12781 ASMCompilerBarrier();
12782 RTUINT128U uDstOut;
12783 uDstOut.au16[0] = uSrc1.au16[4];
12784 uDstOut.au16[1] = uSrc2.au16[4];
12785 uDstOut.au16[2] = uSrc1.au16[5];
12786 uDstOut.au16[3] = uSrc2.au16[5];
12787 uDstOut.au16[4] = uSrc1.au16[6];
12788 uDstOut.au16[5] = uSrc2.au16[6];
12789 uDstOut.au16[6] = uSrc1.au16[7];
12790 uDstOut.au16[7] = uSrc2.au16[7];
12791 *puDst = uDstOut;
12792}
12793
12794
12795IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12796{
12797 RTUINT256U const uSrc2 = *puSrc2;
12798 RTUINT256U const uSrc1 = *puSrc1;
12799 ASMCompilerBarrier();
12800 RTUINT256U uDstOut;
12801 uDstOut.au16[0] = uSrc1.au16[4];
12802 uDstOut.au16[1] = uSrc2.au16[4];
12803 uDstOut.au16[2] = uSrc1.au16[5];
12804 uDstOut.au16[3] = uSrc2.au16[5];
12805 uDstOut.au16[4] = uSrc1.au16[6];
12806 uDstOut.au16[5] = uSrc2.au16[6];
12807 uDstOut.au16[6] = uSrc1.au16[7];
12808 uDstOut.au16[7] = uSrc2.au16[7];
12809
12810 uDstOut.au16[8] = uSrc1.au16[12];
12811 uDstOut.au16[9] = uSrc2.au16[12];
12812 uDstOut.au16[10] = uSrc1.au16[13];
12813 uDstOut.au16[11] = uSrc2.au16[13];
12814 uDstOut.au16[12] = uSrc1.au16[14];
12815 uDstOut.au16[13] = uSrc2.au16[14];
12816 uDstOut.au16[14] = uSrc1.au16[15];
12817 uDstOut.au16[15] = uSrc2.au16[15];
12818 *puDst = uDstOut;
12819}
12820
12821
12822/*
12823 * PUNPCKHDQ - high dwords -> qword(s)
12824 */
12825#ifdef IEM_WITHOUT_ASSEMBLY
12826
12827IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12828{
12829 RTUINT64U const uSrc2 = { *puSrc };
12830 RTUINT64U const uSrc1 = { *puDst };
12831 ASMCompilerBarrier();
12832 RTUINT64U uDstOut;
12833 uDstOut.au32[0] = uSrc1.au32[1];
12834 uDstOut.au32[1] = uSrc2.au32[1];
12835 *puDst = uDstOut.u;
12836}
12837
12838
12839IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12840{
12841 RTUINT128U const uSrc2 = *puSrc;
12842 RTUINT128U const uSrc1 = *puDst;
12843 ASMCompilerBarrier();
12844 RTUINT128U uDstOut;
12845 uDstOut.au32[0] = uSrc1.au32[2];
12846 uDstOut.au32[1] = uSrc2.au32[2];
12847 uDstOut.au32[2] = uSrc1.au32[3];
12848 uDstOut.au32[3] = uSrc2.au32[3];
12849 *puDst = uDstOut;
12850}
12851
12852#endif
12853
12854IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12855{
12856 RTUINT128U const uSrc2 = *puSrc2;
12857 RTUINT128U const uSrc1 = *puSrc1;
12858 ASMCompilerBarrier();
12859 RTUINT128U uDstOut;
12860 uDstOut.au32[0] = uSrc1.au32[2];
12861 uDstOut.au32[1] = uSrc2.au32[2];
12862 uDstOut.au32[2] = uSrc1.au32[3];
12863 uDstOut.au32[3] = uSrc2.au32[3];
12864 *puDst = uDstOut;
12865}
12866
12867
12868IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12869{
12870 RTUINT256U const uSrc2 = *puSrc2;
12871 RTUINT256U const uSrc1 = *puSrc1;
12872 ASMCompilerBarrier();
12873 RTUINT256U uDstOut;
12874 uDstOut.au32[0] = uSrc1.au32[2];
12875 uDstOut.au32[1] = uSrc2.au32[2];
12876 uDstOut.au32[2] = uSrc1.au32[3];
12877 uDstOut.au32[3] = uSrc2.au32[3];
12878
12879 uDstOut.au32[4] = uSrc1.au32[6];
12880 uDstOut.au32[5] = uSrc2.au32[6];
12881 uDstOut.au32[6] = uSrc1.au32[7];
12882 uDstOut.au32[7] = uSrc2.au32[7];
12883 *puDst = uDstOut;
12884}
12885
12886
12887/*
12888 * PUNPCKHQDQ -> High qwords -> double qword(s).
12889 */
12890#ifdef IEM_WITHOUT_ASSEMBLY
12891IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12892{
12893 RTUINT128U const uSrc2 = *puSrc;
12894 RTUINT128U const uSrc1 = *puDst;
12895 ASMCompilerBarrier();
12896 RTUINT128U uDstOut;
12897 uDstOut.au64[0] = uSrc1.au64[1];
12898 uDstOut.au64[1] = uSrc2.au64[1];
12899 *puDst = uDstOut;
12900}
12901#endif
12902
12903
12904IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12905{
12906 RTUINT128U const uSrc2 = *puSrc2;
12907 RTUINT128U const uSrc1 = *puSrc1;
12908 ASMCompilerBarrier();
12909 RTUINT128U uDstOut;
12910 uDstOut.au64[0] = uSrc1.au64[1];
12911 uDstOut.au64[1] = uSrc2.au64[1];
12912 *puDst = uDstOut;
12913}
12914
12915
12916IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12917{
12918 RTUINT256U const uSrc2 = *puSrc2;
12919 RTUINT256U const uSrc1 = *puSrc1;
12920 ASMCompilerBarrier();
12921 RTUINT256U uDstOut;
12922 uDstOut.au64[0] = uSrc1.au64[1];
12923 uDstOut.au64[1] = uSrc2.au64[1];
12924
12925 uDstOut.au64[2] = uSrc1.au64[3];
12926 uDstOut.au64[3] = uSrc2.au64[3];
12927 *puDst = uDstOut;
12928}
12929
12930
12931/*
12932 * PUNPCKLBW - low bytes -> words
12933 */
12934#ifdef IEM_WITHOUT_ASSEMBLY
12935
12936IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12937{
12938 RTUINT64U const uSrc2 = { *puSrc };
12939 RTUINT64U const uSrc1 = { *puDst };
12940 ASMCompilerBarrier();
12941 RTUINT64U uDstOut;
12942 uDstOut.au8[0] = uSrc1.au8[0];
12943 uDstOut.au8[1] = uSrc2.au8[0];
12944 uDstOut.au8[2] = uSrc1.au8[1];
12945 uDstOut.au8[3] = uSrc2.au8[1];
12946 uDstOut.au8[4] = uSrc1.au8[2];
12947 uDstOut.au8[5] = uSrc2.au8[2];
12948 uDstOut.au8[6] = uSrc1.au8[3];
12949 uDstOut.au8[7] = uSrc2.au8[3];
12950 *puDst = uDstOut.u;
12951}
12952
12953
12954IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12955{
12956 RTUINT128U const uSrc2 = *puSrc;
12957 RTUINT128U const uSrc1 = *puDst;
12958 ASMCompilerBarrier();
12959 RTUINT128U uDstOut;
12960 uDstOut.au8[ 0] = uSrc1.au8[0];
12961 uDstOut.au8[ 1] = uSrc2.au8[0];
12962 uDstOut.au8[ 2] = uSrc1.au8[1];
12963 uDstOut.au8[ 3] = uSrc2.au8[1];
12964 uDstOut.au8[ 4] = uSrc1.au8[2];
12965 uDstOut.au8[ 5] = uSrc2.au8[2];
12966 uDstOut.au8[ 6] = uSrc1.au8[3];
12967 uDstOut.au8[ 7] = uSrc2.au8[3];
12968 uDstOut.au8[ 8] = uSrc1.au8[4];
12969 uDstOut.au8[ 9] = uSrc2.au8[4];
12970 uDstOut.au8[10] = uSrc1.au8[5];
12971 uDstOut.au8[11] = uSrc2.au8[5];
12972 uDstOut.au8[12] = uSrc1.au8[6];
12973 uDstOut.au8[13] = uSrc2.au8[6];
12974 uDstOut.au8[14] = uSrc1.au8[7];
12975 uDstOut.au8[15] = uSrc2.au8[7];
12976 *puDst = uDstOut;
12977}
12978
12979#endif
12980
12981IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12982{
12983 RTUINT128U const uSrc2 = *puSrc2;
12984 RTUINT128U const uSrc1 = *puSrc1;
12985 ASMCompilerBarrier();
12986 RTUINT128U uDstOut;
12987 uDstOut.au8[ 0] = uSrc1.au8[0];
12988 uDstOut.au8[ 1] = uSrc2.au8[0];
12989 uDstOut.au8[ 2] = uSrc1.au8[1];
12990 uDstOut.au8[ 3] = uSrc2.au8[1];
12991 uDstOut.au8[ 4] = uSrc1.au8[2];
12992 uDstOut.au8[ 5] = uSrc2.au8[2];
12993 uDstOut.au8[ 6] = uSrc1.au8[3];
12994 uDstOut.au8[ 7] = uSrc2.au8[3];
12995 uDstOut.au8[ 8] = uSrc1.au8[4];
12996 uDstOut.au8[ 9] = uSrc2.au8[4];
12997 uDstOut.au8[10] = uSrc1.au8[5];
12998 uDstOut.au8[11] = uSrc2.au8[5];
12999 uDstOut.au8[12] = uSrc1.au8[6];
13000 uDstOut.au8[13] = uSrc2.au8[6];
13001 uDstOut.au8[14] = uSrc1.au8[7];
13002 uDstOut.au8[15] = uSrc2.au8[7];
13003 *puDst = uDstOut;
13004}
13005
13006
13007IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13008{
13009 RTUINT256U const uSrc2 = *puSrc2;
13010 RTUINT256U const uSrc1 = *puSrc1;
13011 ASMCompilerBarrier();
13012 RTUINT256U uDstOut;
13013 uDstOut.au8[ 0] = uSrc1.au8[0];
13014 uDstOut.au8[ 1] = uSrc2.au8[0];
13015 uDstOut.au8[ 2] = uSrc1.au8[1];
13016 uDstOut.au8[ 3] = uSrc2.au8[1];
13017 uDstOut.au8[ 4] = uSrc1.au8[2];
13018 uDstOut.au8[ 5] = uSrc2.au8[2];
13019 uDstOut.au8[ 6] = uSrc1.au8[3];
13020 uDstOut.au8[ 7] = uSrc2.au8[3];
13021 uDstOut.au8[ 8] = uSrc1.au8[4];
13022 uDstOut.au8[ 9] = uSrc2.au8[4];
13023 uDstOut.au8[10] = uSrc1.au8[5];
13024 uDstOut.au8[11] = uSrc2.au8[5];
13025 uDstOut.au8[12] = uSrc1.au8[6];
13026 uDstOut.au8[13] = uSrc2.au8[6];
13027 uDstOut.au8[14] = uSrc1.au8[7];
13028 uDstOut.au8[15] = uSrc2.au8[7];
13029 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
13030 uDstOut.au8[16] = uSrc1.au8[16];
13031 uDstOut.au8[17] = uSrc2.au8[16];
13032 uDstOut.au8[18] = uSrc1.au8[17];
13033 uDstOut.au8[19] = uSrc2.au8[17];
13034 uDstOut.au8[20] = uSrc1.au8[18];
13035 uDstOut.au8[21] = uSrc2.au8[18];
13036 uDstOut.au8[22] = uSrc1.au8[19];
13037 uDstOut.au8[23] = uSrc2.au8[19];
13038 uDstOut.au8[24] = uSrc1.au8[20];
13039 uDstOut.au8[25] = uSrc2.au8[20];
13040 uDstOut.au8[26] = uSrc1.au8[21];
13041 uDstOut.au8[27] = uSrc2.au8[21];
13042 uDstOut.au8[28] = uSrc1.au8[22];
13043 uDstOut.au8[29] = uSrc2.au8[22];
13044 uDstOut.au8[30] = uSrc1.au8[23];
13045 uDstOut.au8[31] = uSrc2.au8[23];
13046 *puDst = uDstOut;
13047}
13048
13049
13050/*
13051 * PUNPCKLWD - low words -> dwords
13052 */
13053#ifdef IEM_WITHOUT_ASSEMBLY
13054
13055IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
13056{
13057 RTUINT64U const uSrc2 = { *puSrc };
13058 RTUINT64U const uSrc1 = { *puDst };
13059 ASMCompilerBarrier();
13060 RTUINT64U uDstOut;
13061 uDstOut.au16[0] = uSrc1.au16[0];
13062 uDstOut.au16[1] = uSrc2.au16[0];
13063 uDstOut.au16[2] = uSrc1.au16[1];
13064 uDstOut.au16[3] = uSrc2.au16[1];
13065 *puDst = uDstOut.u;
13066}
13067
13068
13069IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13070{
13071 RTUINT128U const uSrc2 = *puSrc;
13072 RTUINT128U const uSrc1 = *puDst;
13073 ASMCompilerBarrier();
13074 RTUINT128U uDstOut;
13075 uDstOut.au16[0] = uSrc1.au16[0];
13076 uDstOut.au16[1] = uSrc2.au16[0];
13077 uDstOut.au16[2] = uSrc1.au16[1];
13078 uDstOut.au16[3] = uSrc2.au16[1];
13079 uDstOut.au16[4] = uSrc1.au16[2];
13080 uDstOut.au16[5] = uSrc2.au16[2];
13081 uDstOut.au16[6] = uSrc1.au16[3];
13082 uDstOut.au16[7] = uSrc2.au16[3];
13083 *puDst = uDstOut;
13084}
13085
13086#endif
13087
13088IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13089{
13090 RTUINT128U const uSrc2 = *puSrc2;
13091 RTUINT128U const uSrc1 = *puSrc1;
13092 ASMCompilerBarrier();
13093 RTUINT128U uDstOut;
13094 uDstOut.au16[0] = uSrc1.au16[0];
13095 uDstOut.au16[1] = uSrc2.au16[0];
13096 uDstOut.au16[2] = uSrc1.au16[1];
13097 uDstOut.au16[3] = uSrc2.au16[1];
13098 uDstOut.au16[4] = uSrc1.au16[2];
13099 uDstOut.au16[5] = uSrc2.au16[2];
13100 uDstOut.au16[6] = uSrc1.au16[3];
13101 uDstOut.au16[7] = uSrc2.au16[3];
13102 *puDst = uDstOut;
13103}
13104
13105
13106IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13107{
13108 RTUINT256U const uSrc2 = *puSrc2;
13109 RTUINT256U const uSrc1 = *puSrc1;
13110 ASMCompilerBarrier();
13111 RTUINT256U uDstOut;
13112 uDstOut.au16[0] = uSrc1.au16[0];
13113 uDstOut.au16[1] = uSrc2.au16[0];
13114 uDstOut.au16[2] = uSrc1.au16[1];
13115 uDstOut.au16[3] = uSrc2.au16[1];
13116 uDstOut.au16[4] = uSrc1.au16[2];
13117 uDstOut.au16[5] = uSrc2.au16[2];
13118 uDstOut.au16[6] = uSrc1.au16[3];
13119 uDstOut.au16[7] = uSrc2.au16[3];
13120
13121 uDstOut.au16[8] = uSrc1.au16[8];
13122 uDstOut.au16[9] = uSrc2.au16[8];
13123 uDstOut.au16[10] = uSrc1.au16[9];
13124 uDstOut.au16[11] = uSrc2.au16[9];
13125 uDstOut.au16[12] = uSrc1.au16[10];
13126 uDstOut.au16[13] = uSrc2.au16[10];
13127 uDstOut.au16[14] = uSrc1.au16[11];
13128 uDstOut.au16[15] = uSrc2.au16[11];
13129 *puDst = uDstOut;
13130}
13131
13132
13133/*
13134 * PUNPCKLDQ - low dwords -> qword(s)
13135 */
13136#ifdef IEM_WITHOUT_ASSEMBLY
13137
13138IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
13139{
13140 RTUINT64U const uSrc2 = { *puSrc };
13141 RTUINT64U const uSrc1 = { *puDst };
13142 ASMCompilerBarrier();
13143 RTUINT64U uDstOut;
13144 uDstOut.au32[0] = uSrc1.au32[0];
13145 uDstOut.au32[1] = uSrc2.au32[0];
13146 *puDst = uDstOut.u;
13147}
13148
13149
13150IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13151{
13152 RTUINT128U const uSrc2 = *puSrc;
13153 RTUINT128U const uSrc1 = *puDst;
13154 ASMCompilerBarrier();
13155 RTUINT128U uDstOut;
13156 uDstOut.au32[0] = uSrc1.au32[0];
13157 uDstOut.au32[1] = uSrc2.au32[0];
13158 uDstOut.au32[2] = uSrc1.au32[1];
13159 uDstOut.au32[3] = uSrc2.au32[1];
13160 *puDst = uDstOut;
13161}
13162
13163#endif
13164
13165IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13166{
13167 RTUINT128U const uSrc2 = *puSrc2;
13168 RTUINT128U const uSrc1 = *puSrc1;
13169 ASMCompilerBarrier();
13170 RTUINT128U uDstOut;
13171 uDstOut.au32[0] = uSrc1.au32[0];
13172 uDstOut.au32[1] = uSrc2.au32[0];
13173 uDstOut.au32[2] = uSrc1.au32[1];
13174 uDstOut.au32[3] = uSrc2.au32[1];
13175 *puDst = uDstOut;
13176}
13177
13178
13179IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13180{
13181 RTUINT256U const uSrc2 = *puSrc2;
13182 RTUINT256U const uSrc1 = *puSrc1;
13183 ASMCompilerBarrier();
13184 RTUINT256U uDstOut;
13185 uDstOut.au32[0] = uSrc1.au32[0];
13186 uDstOut.au32[1] = uSrc2.au32[0];
13187 uDstOut.au32[2] = uSrc1.au32[1];
13188 uDstOut.au32[3] = uSrc2.au32[1];
13189
13190 uDstOut.au32[4] = uSrc1.au32[4];
13191 uDstOut.au32[5] = uSrc2.au32[4];
13192 uDstOut.au32[6] = uSrc1.au32[5];
13193 uDstOut.au32[7] = uSrc2.au32[5];
13194 *puDst = uDstOut;
13195}
13196
13197
13198/*
13199 * PUNPCKLQDQ -> Low qwords -> double qword(s).
13200 */
13201#ifdef IEM_WITHOUT_ASSEMBLY
13202IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13203{
13204 RTUINT128U const uSrc2 = *puSrc;
13205 RTUINT128U const uSrc1 = *puDst;
13206 ASMCompilerBarrier();
13207 RTUINT128U uDstOut;
13208 uDstOut.au64[0] = uSrc1.au64[0];
13209 uDstOut.au64[1] = uSrc2.au64[0];
13210 *puDst = uDstOut;
13211}
13212#endif
13213
13214
13215IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13216{
13217 RTUINT128U const uSrc2 = *puSrc2;
13218 RTUINT128U const uSrc1 = *puSrc1;
13219 ASMCompilerBarrier();
13220 RTUINT128U uDstOut;
13221 uDstOut.au64[0] = uSrc1.au64[0];
13222 uDstOut.au64[1] = uSrc2.au64[0];
13223 *puDst = uDstOut;
13224}
13225
13226
13227IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13228{
13229 RTUINT256U const uSrc2 = *puSrc2;
13230 RTUINT256U const uSrc1 = *puSrc1;
13231 ASMCompilerBarrier();
13232 RTUINT256U uDstOut;
13233 uDstOut.au64[0] = uSrc1.au64[0];
13234 uDstOut.au64[1] = uSrc2.au64[0];
13235
13236 uDstOut.au64[2] = uSrc1.au64[2];
13237 uDstOut.au64[3] = uSrc2.au64[2];
13238 *puDst = uDstOut;
13239}
13240
13241
13242/*
13243 * PACKSSWB - signed words -> signed bytes
13244 */
13245
13246#ifdef IEM_WITHOUT_ASSEMBLY
13247
13248IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13249{
13250 RTUINT64U const uSrc2 = { *puSrc };
13251 RTUINT64U const uSrc1 = { *puDst };
13252 ASMCompilerBarrier();
13253 RTUINT64U uDstOut;
13254 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13255 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13256 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13257 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13258 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13259 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13260 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13261 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13262 *puDst = uDstOut.u;
13263}
13264
13265
13266IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13267{
13268 RTUINT128U const uSrc2 = *puSrc;
13269 RTUINT128U const uSrc1 = *puDst;
13270 ASMCompilerBarrier();
13271 RTUINT128U uDstOut;
13272 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13273 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13274 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13275 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13276 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13277 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13278 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13279 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13280 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13281 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13282 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13283 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13284 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13285 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13286 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13287 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13288 *puDst = uDstOut;
13289}
13290
13291#endif
13292
13293IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13294{
13295 RTUINT128U const uSrc2 = *puSrc2;
13296 RTUINT128U const uSrc1 = *puSrc1;
13297 ASMCompilerBarrier();
13298 RTUINT128U uDstOut;
13299 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13300 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13301 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13302 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13303 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13304 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13305 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13306 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13307 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13308 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13309 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13310 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13311 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13312 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13313 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13314 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13315 *puDst = uDstOut;
13316}
13317
13318
13319IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13320{
13321 RTUINT256U const uSrc2 = *puSrc2;
13322 RTUINT256U const uSrc1 = *puSrc1;
13323 ASMCompilerBarrier();
13324 RTUINT256U uDstOut;
13325 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13326 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13327 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13328 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13329 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13330 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13331 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13332 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13333 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13334 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13335 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13336 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13337 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13338 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13339 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13340 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13341
13342 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
13343 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
13344 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
13345 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
13346 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
13347 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
13348 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
13349 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
13350 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
13351 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
13352 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
13353 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
13354 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
13355 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
13356 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
13357 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
13358 *puDst = uDstOut;
13359}
13360
13361
13362/*
13363 * PACKUSWB - signed words -> unsigned bytes
13364 */
/**
 * Converts a 16-bit word, taken as a signed value, to an unsigned byte with
 * saturation: 0..0xff passes through unchanged, a word with bit 15 set (the
 * sign bit) gives 0x00, and every other word gives 0xff.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    (   (uint16_t)(a_iWord) > (uint16_t)0xff \
      ? (uint8_t)0xff * (uint8_t)((((a_iWord) >> 15) & 1) ^ 1) \
      : (uint8_t)(a_iWord) )
13369
13370#ifdef IEM_WITHOUT_ASSEMBLY
13371
13372IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13373{
13374 RTUINT64U const uSrc2 = { *puSrc };
13375 RTUINT64U const uSrc1 = { *puDst };
13376 ASMCompilerBarrier();
13377 RTUINT64U uDstOut;
13378 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13379 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13380 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13381 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13382 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13383 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13384 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13385 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13386 *puDst = uDstOut.u;
13387}
13388
13389
13390IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13391{
13392 RTUINT128U const uSrc2 = *puSrc;
13393 RTUINT128U const uSrc1 = *puDst;
13394 ASMCompilerBarrier();
13395 RTUINT128U uDstOut;
13396 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13397 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13398 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13399 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13400 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13401 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13402 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13403 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13404 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13405 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13406 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13407 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13408 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13409 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13410 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13411 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13412 *puDst = uDstOut;
13413}
13414
13415#endif
13416
13417IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13418{
13419 RTUINT128U const uSrc2 = *puSrc2;
13420 RTUINT128U const uSrc1 = *puSrc1;
13421 ASMCompilerBarrier();
13422 RTUINT128U uDstOut;
13423 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13424 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13425 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13426 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13427 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13428 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13429 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13430 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13431 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13432 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13433 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13434 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13435 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13436 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13437 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13438 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13439 *puDst = uDstOut;
13440}
13441
13442
13443IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13444{
13445 RTUINT256U const uSrc2 = *puSrc2;
13446 RTUINT256U const uSrc1 = *puSrc1;
13447 ASMCompilerBarrier();
13448 RTUINT256U uDstOut;
13449 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13450 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13451 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13452 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13453 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13454 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13455 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13456 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13457 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13458 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13459 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13460 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13461 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13462 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13463 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13464 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13465
13466 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
13467 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
13468 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
13469 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
13470 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
13471 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
13472 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
13473 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
13474 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
13475 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
13476 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
13477 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
13478 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
13479 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
13480 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
13481 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
13482 *puDst = uDstOut;
13483}
13484
13485
13486/*
13487 * PACKSSDW - signed dwords -> signed words
13488 */
13489
13490#ifdef IEM_WITHOUT_ASSEMBLY
13491
13492IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13493{
13494 RTUINT64U const uSrc2 = { *puSrc };
13495 RTUINT64U const uSrc1 = { *puDst };
13496 ASMCompilerBarrier();
13497 RTUINT64U uDstOut;
13498 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13499 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13500 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13501 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13502 *puDst = uDstOut.u;
13503}
13504
13505
13506IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13507{
13508 RTUINT128U const uSrc2 = *puSrc;
13509 RTUINT128U const uSrc1 = *puDst;
13510 ASMCompilerBarrier();
13511 RTUINT128U uDstOut;
13512 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13513 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13514 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13515 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13516 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13517 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13518 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13519 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13520 *puDst = uDstOut;
13521}
13522
13523#endif
13524
13525IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13526{
13527 RTUINT128U const uSrc2 = *puSrc2;
13528 RTUINT128U const uSrc1 = *puSrc1;
13529 ASMCompilerBarrier();
13530 RTUINT128U uDstOut;
13531 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13532 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13533 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13534 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13535 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13536 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13537 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13538 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13539 *puDst = uDstOut;
13540}
13541
13542
13543IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13544{
13545 RTUINT256U const uSrc2 = *puSrc2;
13546 RTUINT256U const uSrc1 = *puSrc1;
13547 ASMCompilerBarrier();
13548 RTUINT256U uDstOut;
13549 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13550 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13551 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13552 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13553 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13554 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13555 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13556 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13557
13558 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
13559 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
13560 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
13561 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
13562 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
13563 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
13564 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
13565 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
13566 *puDst = uDstOut;
13567}
13568
13569
13570/*
13571 * PACKUSDW - signed dwords -> unsigned words
13572 */
/**
 * Converts a 32-bit dword, taken as a signed value, to an unsigned word with
 * saturation: 0..0xffff passes through unchanged, a dword with bit 31 set
 * (the sign bit) gives 0x0000, and every other dword gives 0xffff.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    (   (uint32_t)(a_iDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff * (uint16_t)((((a_iDword) >> 31) & 1) ^ 1) \
      : (uint16_t)(a_iDword) )
13577
13578#ifdef IEM_WITHOUT_ASSEMBLY
13579IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13580{
13581 RTUINT128U const uSrc2 = *puSrc;
13582 RTUINT128U const uSrc1 = *puDst;
13583 ASMCompilerBarrier();
13584 RTUINT128U uDstOut;
13585 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13586 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13587 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13588 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13589 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13590 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13591 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13592 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13593 *puDst = uDstOut;
13594}
13595#endif
13596
13597IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13598{
13599 RTUINT128U const uSrc2 = *puSrc2;
13600 RTUINT128U const uSrc1 = *puSrc1;
13601 ASMCompilerBarrier();
13602 RTUINT128U uDstOut;
13603 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13604 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13605 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13606 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13607 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13608 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13609 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13610 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13611 *puDst = uDstOut;
13612}
13613
13614
13615IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13616{
13617 RTUINT256U const uSrc2 = *puSrc2;
13618 RTUINT256U const uSrc1 = *puSrc1;
13619 ASMCompilerBarrier();
13620 RTUINT256U uDstOut;
13621 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13622 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13623 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13624 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13625 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13626 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13627 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13628 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13629
13630 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
13631 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
13632 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
13633 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
13634 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
13635 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
13636 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
13637 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
13638 *puDst = uDstOut;
13639}
13640
13641
13642/*
13643 * [V]PABSB / [V]PABSW / [V]PABSD
13644 */
13645
13646IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13647{
13648 RTUINT64U const uSrc = { *puSrc };
13649 RTUINT64U uDstOut = { 0 };
13650
13651 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
13652 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
13653 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
13654 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
13655 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
13656 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
13657 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
13658 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
13659 *puDst = uDstOut.u;
13660}
13661
13662
13663IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13664{
13665 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13666 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13667 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13668 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13669 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13670 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13671 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13672 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13673 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13674 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13675 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13676 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13677 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13678 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13679 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13680 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13681}
13682
13683
13684IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13685{
13686 RTUINT64U const uSrc = { *puSrc };
13687 RTUINT64U uDstOut = { 0 };
13688
13689 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
13690 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
13691 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
13692 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
13693 *puDst = uDstOut.u;
13694}
13695
13696
13697IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13698{
13699 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13700 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13701 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13702 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13703 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13704 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13705 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13706 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13707}
13708
13709
13710IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13711{
13712 RTUINT64U const uSrc = { *puSrc };
13713 RTUINT64U uDstOut = { 0 };
13714
13715 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
13716 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
13717 *puDst = uDstOut.u;
13718}
13719
13720
13721IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13722{
13723 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13724 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13725 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13726 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13727}
13728
13729
13730IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13731{
13732 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13733 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13734 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13735 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13736 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13737 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13738 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13739 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13740 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13741 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13742 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13743 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13744 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13745 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13746 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13747 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13748}
13749
13750
13751IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13752{
13753 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13754 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13755 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13756 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13757 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13758 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13759 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13760 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13761 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13762 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13763 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13764 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13765 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13766 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13767 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13768 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13769 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
13770 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
13771 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
13772 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
13773 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
13774 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
13775 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
13776 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
13777 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
13778 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
13779 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
13780 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
13781 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
13782 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
13783 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
13784 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
13785}
13786
13787
13788IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13789{
13790 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13791 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13792 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13793 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13794 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13795 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13796 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13797 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13798}
13799
13800
13801IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13802{
13803 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13804 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13805 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13806 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13807 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13808 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13809 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13810 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13811 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
13812 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
13813 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
13814 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
13815 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
13816 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
13817 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
13818 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
13819}
13820
13821
13822IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13823{
13824 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13825 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13826 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13827 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13828}
13829
13830
13831IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13832{
13833 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13834 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13835 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13836 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13837 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
13838 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
13839 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
13840 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
13841}
13842
13843
13844/*
13845 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
13846 */
13847IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13848{
13849 RTUINT64U uSrc1 = { *puDst };
13850 RTUINT64U uSrc2 = { *puSrc };
13851 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13852
13853 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
13854 {
13855 if (uSrc2.ai8[i] < 0)
13856 uDst.ai8[i] = -uSrc1.ai8[i];
13857 else if (uSrc2.ai8[i] == 0)
13858 uDst.ai8[i] = 0;
13859 else /* uSrc2.ai8[i] > 0 */
13860 uDst.ai8[i] = uSrc1.ai8[i];
13861 }
13862
13863 *puDst = uDst.u;
13864}
13865
13866
13867IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13868{
13869 RTUINT128U uSrc1 = *puDst;
13870
13871 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13872 {
13873 if (puSrc->ai8[i] < 0)
13874 puDst->ai8[i] = -uSrc1.ai8[i];
13875 else if (puSrc->ai8[i] == 0)
13876 puDst->ai8[i] = 0;
13877 else /* puSrc->ai8[i] > 0 */
13878 puDst->ai8[i] = uSrc1.ai8[i];
13879 }
13880}
13881
13882
13883IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13884{
13885 RTUINT64U uSrc1 = { *puDst };
13886 RTUINT64U uSrc2 = { *puSrc };
13887 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13888
13889 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
13890 {
13891 if (uSrc2.ai16[i] < 0)
13892 uDst.ai16[i] = -uSrc1.ai16[i];
13893 else if (uSrc2.ai16[i] == 0)
13894 uDst.ai16[i] = 0;
13895 else /* uSrc2.ai16[i] > 0 */
13896 uDst.ai16[i] = uSrc1.ai16[i];
13897 }
13898
13899 *puDst = uDst.u;
13900}
13901
13902
13903IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13904{
13905 RTUINT128U uSrc1 = *puDst;
13906
13907 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13908 {
13909 if (puSrc->ai16[i] < 0)
13910 puDst->ai16[i] = -uSrc1.ai16[i];
13911 else if (puSrc->ai16[i] == 0)
13912 puDst->ai16[i] = 0;
13913 else /* puSrc->ai16[i] > 0 */
13914 puDst->ai16[i] = uSrc1.ai16[i];
13915 }
13916}
13917
13918
13919IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13920{
13921 RTUINT64U uSrc1 = { *puDst };
13922 RTUINT64U uSrc2 = { *puSrc };
13923 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13924
13925 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
13926 {
13927 if (uSrc2.ai32[i] < 0)
13928 uDst.ai32[i] = -uSrc1.ai32[i];
13929 else if (uSrc2.ai32[i] == 0)
13930 uDst.ai32[i] = 0;
13931 else /* uSrc2.ai32[i] > 0 */
13932 uDst.ai32[i] = uSrc1.ai32[i];
13933 }
13934
13935 *puDst = uDst.u;
13936}
13937
13938
13939IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13940{
13941 RTUINT128U uSrc1 = *puDst;
13942
13943 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13944 {
13945 if (puSrc->ai32[i] < 0)
13946 puDst->ai32[i] = -uSrc1.ai32[i];
13947 else if (puSrc->ai32[i] == 0)
13948 puDst->ai32[i] = 0;
13949 else /* puSrc->ai32[i] > 0 */
13950 puDst->ai32[i] = uSrc1.ai32[i];
13951 }
13952}
13953
13954
13955IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13956{
13957 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13958 {
13959 if (puSrc2->ai8[i] < 0)
13960 puDst->ai8[i] = -puSrc1->ai8[i];
13961 else if (puSrc2->ai8[i] == 0)
13962 puDst->ai8[i] = 0;
13963 else /* puSrc2->ai8[i] > 0 */
13964 puDst->ai8[i] = puSrc1->ai8[i];
13965 }
13966}
13967
13968
13969IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13970{
13971 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13972 {
13973 if (puSrc2->ai8[i] < 0)
13974 puDst->ai8[i] = -puSrc1->ai8[i];
13975 else if (puSrc2->ai8[i] == 0)
13976 puDst->ai8[i] = 0;
13977 else /* puSrc2->ai8[i] > 0 */
13978 puDst->ai8[i] = puSrc1->ai8[i];
13979 }
13980}
13981
13982
13983IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13984{
13985 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13986 {
13987 if (puSrc2->ai16[i] < 0)
13988 puDst->ai16[i] = -puSrc1->ai16[i];
13989 else if (puSrc2->ai16[i] == 0)
13990 puDst->ai16[i] = 0;
13991 else /* puSrc2->ai16[i] > 0 */
13992 puDst->ai16[i] = puSrc1->ai16[i];
13993 }
13994}
13995
13996
13997IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13998{
13999 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
14000 {
14001 if (puSrc2->ai16[i] < 0)
14002 puDst->ai16[i] = -puSrc1->ai16[i];
14003 else if (puSrc2->ai16[i] == 0)
14004 puDst->ai16[i] = 0;
14005 else /* puSrc2->ai16[i] > 0 */
14006 puDst->ai16[i] = puSrc1->ai16[i];
14007 }
14008}
14009
14010
14011IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14012{
14013 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
14014 {
14015 if (puSrc2->ai32[i] < 0)
14016 puDst->ai32[i] = -puSrc1->ai32[i];
14017 else if (puSrc2->ai32[i] == 0)
14018 puDst->ai32[i] = 0;
14019 else /* puSrc2->ai32[i] > 0 */
14020 puDst->ai32[i] = puSrc1->ai32[i];
14021 }
14022}
14023
14024
14025IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14026{
14027 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
14028 {
14029 if (puSrc2->ai32[i] < 0)
14030 puDst->ai32[i] = -puSrc1->ai32[i];
14031 else if (puSrc2->ai32[i] == 0)
14032 puDst->ai32[i] = 0;
14033 else /* puSrc2->ai32[i] > 0 */
14034 puDst->ai32[i] = puSrc1->ai32[i];
14035 }
14036}
14037
14038
14039/*
14040 * PHADDW / VPHADDW / PHADDD / VPHADDD
14041 */
14042IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14043{
14044 RTUINT64U uSrc1 = { *puDst };
14045 RTUINT64U uSrc2 = { *puSrc };
14046 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14047
14048 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
14049 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
14050 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
14051 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
14052 *puDst = uDst.u;
14053}
14054
14055
14056IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14057{
14058 RTUINT128U uSrc1 = *puDst;
14059
14060 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
14061 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
14062 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
14063 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
14064
14065 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
14066 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
14067 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
14068 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
14069}
14070
14071
14072IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14073{
14074 RTUINT64U uSrc1 = { *puDst };
14075 RTUINT64U uSrc2 = { *puSrc };
14076 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14077
14078 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14079 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
14080 *puDst = uDst.u;
14081}
14082
14083
14084IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14085{
14086 RTUINT128U uSrc1 = *puDst;
14087
14088 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14089 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
14090
14091 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
14092 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
14093}
14094
14095
14096IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14097{
14098 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14099
14100 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
14101 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
14102 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
14103 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
14104
14105 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
14106 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
14107 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
14108 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
14109
14110 puDst->au64[0] = uDst.au64[0];
14111 puDst->au64[1] = uDst.au64[1];
14112}
14113
14114
14115IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14116{
14117 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14118
14119 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
14120 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
14121 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
14122 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
14123 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
14124 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
14125 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
14126 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
14127
14128 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
14129 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
14130 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
14131 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
14132 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
14133 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
14134 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
14135 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
14136
14137 puDst->au64[0] = uDst.au64[0];
14138 puDst->au64[1] = uDst.au64[1];
14139 puDst->au64[2] = uDst.au64[2];
14140 puDst->au64[3] = uDst.au64[3];
14141}
14142
14143
14144IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14145{
14146 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14147
14148 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
14149 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
14150
14151 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
14152 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
14153
14154 puDst->au64[0] = uDst.au64[0];
14155 puDst->au64[1] = uDst.au64[1];
14156}
14157
14158
14159IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14160{
14161 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14162
14163 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
14164 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
14165 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
14166 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
14167
14168 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
14169 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
14170 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
14171 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
14172
14173 puDst->au64[0] = uDst.au64[0];
14174 puDst->au64[1] = uDst.au64[1];
14175 puDst->au64[2] = uDst.au64[2];
14176 puDst->au64[3] = uDst.au64[3];
14177}
14178
14179
14180/*
14181 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
14182 */
14183IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14184{
14185 RTUINT64U uSrc1 = { *puDst };
14186 RTUINT64U uSrc2 = { *puSrc };
14187 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14188
14189 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14190 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14191 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
14192 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
14193 *puDst = uDst.u;
14194}
14195
14196
14197IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14198{
14199 RTUINT128U uSrc1 = *puDst;
14200
14201 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14202 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14203 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
14204 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
14205
14206 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
14207 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
14208 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
14209 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
14210}
14211
14212
14213IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14214{
14215 RTUINT64U uSrc1 = { *puDst };
14216 RTUINT64U uSrc2 = { *puSrc };
14217 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14218
14219 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14220 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
14221 *puDst = uDst.u;
14222}
14223
14224
14225IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14226{
14227 RTUINT128U uSrc1 = *puDst;
14228
14229 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14230 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
14231
14232 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
14233 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
14234}
14235
14236
14237IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14238{
14239 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14240
14241 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
14242 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
14243 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
14244 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
14245
14246 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
14247 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
14248 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
14249 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
14250
14251 puDst->au64[0] = uDst.au64[0];
14252 puDst->au64[1] = uDst.au64[1];
14253}
14254
14255
14256IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14257{
14258 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14259
14260 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
14261 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
14262 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
14263 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
14264 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
14265 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
14266 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
14267 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
14268
14269 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
14270 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
14271 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
14272 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
14273 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
14274 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
14275 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
14276 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
14277
14278 puDst->au64[0] = uDst.au64[0];
14279 puDst->au64[1] = uDst.au64[1];
14280 puDst->au64[2] = uDst.au64[2];
14281 puDst->au64[3] = uDst.au64[3];
14282}
14283
14284
14285IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14286{
14287 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14288
14289 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
14290 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
14291
14292 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
14293 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
14294
14295 puDst->au64[0] = uDst.au64[0];
14296 puDst->au64[1] = uDst.au64[1];
14297}
14298
14299
14300IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14301{
14302 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14303
14304 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
14305 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
14306 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
14307 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
14308
14309 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
14310 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
14311 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
14312 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
14313
14314 puDst->au64[0] = uDst.au64[0];
14315 puDst->au64[1] = uDst.au64[1];
14316 puDst->au64[2] = uDst.au64[2];
14317 puDst->au64[3] = uDst.au64[3];
14318}
14319
14320
14321/*
14322 * PHADDSW / VPHADDSW
14323 */
14324IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14325{
14326 RTUINT64U uSrc1 = { *puDst };
14327 RTUINT64U uSrc2 = { *puSrc };
14328 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14329
14330 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14331 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14332 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
14333 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
14334 *puDst = uDst.u;
14335}
14336
14337
14338IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14339{
14340 RTUINT128U uSrc1 = *puDst;
14341
14342 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14343 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14344 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
14345 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
14346
14347 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
14348 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
14349 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
14350 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
14351}
14352
14353
14354IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14355{
14356 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14357
14358 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
14359 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
14360 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
14361 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
14362
14363 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
14364 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
14365 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
14366 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
14367
14368 puDst->au64[0] = uDst.au64[0];
14369 puDst->au64[1] = uDst.au64[1];
14370}
14371
14372
14373IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14374{
14375 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14376
14377 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
14378 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
14379 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
14380 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
14381 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
14382 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
14383 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
14384 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
14385
14386 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
14387 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
14388 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
14389 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
14390 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
14391 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
14392 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
14393 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
14394
14395 puDst->au64[0] = uDst.au64[0];
14396 puDst->au64[1] = uDst.au64[1];
14397 puDst->au64[2] = uDst.au64[2];
14398 puDst->au64[3] = uDst.au64[3];
14399}
14400
14401
14402/*
14403 * PHSUBSW / VPHSUBSW
14404 */
14405IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14406{
14407 RTUINT64U uSrc1 = { *puDst };
14408 RTUINT64U uSrc2 = { *puSrc };
14409 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14410
14411 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14412 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14413 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
14414 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
14415 *puDst = uDst.u;
14416}
14417
14418
14419IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14420{
14421 RTUINT128U uSrc1 = *puDst;
14422
14423 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14424 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14425 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
14426 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
14427
14428 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
14429 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
14430 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
14431 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
14432}
14433
14434
14435IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14436{
14437 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14438
14439 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
14440 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
14441 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
14442 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
14443
14444 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
14445 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
14446 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
14447 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
14448
14449 puDst->au64[0] = uDst.au64[0];
14450 puDst->au64[1] = uDst.au64[1];
14451}
14452
14453
14454IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14455{
14456 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14457
14458 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
14459 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
14460 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
14461 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
14462 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
14463 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
14464 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
14465 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
14466
14467 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
14468 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
14469 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
14470 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
14471 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
14472 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
14473 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
14474 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
14475
14476 puDst->au64[0] = uDst.au64[0];
14477 puDst->au64[1] = uDst.au64[1];
14478 puDst->au64[2] = uDst.au64[2];
14479 puDst->au64[3] = uDst.au64[3];
14480}
14481
14482
14483/*
14484 * PMADDUBSW / VPMADDUBSW
14485 */
14486IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14487{
14488 RTUINT64U uSrc1 = { *puDst };
14489 RTUINT64U uSrc2 = { *puSrc };
14490 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14491
14492 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
14493 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
14494 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
14495 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
14496 *puDst = uDst.u;
14497}
14498
14499
14500IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14501{
14502 RTUINT128U uSrc1 = *puDst;
14503
14504 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
14505 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
14506 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
14507 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
14508 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
14509 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
14510 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
14511 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
14512}
14513
14514
14515IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14516{
14517 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14518
14519 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14520 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14521 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14522 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14523 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14524 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14525 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14526 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14527
14528 puDst->au64[0] = uDst.au64[0];
14529 puDst->au64[1] = uDst.au64[1];
14530}
14531
14532
14533IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14534{
14535 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14536
14537 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14538 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14539 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14540 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14541 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14542 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14543 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14544 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14545 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
14546 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
14547 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
14548 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
14549 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
14550 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
14551 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
14552 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
14553
14554 puDst->au64[0] = uDst.au64[0];
14555 puDst->au64[1] = uDst.au64[1];
14556 puDst->au64[2] = uDst.au64[2];
14557 puDst->au64[3] = uDst.au64[3];
14558}
14559
14560
14561/*
14562 * PMULHRSW / VPMULHRSW
14563 */
/** One PMULHRSW lane: the 32-bit product of the two operands is shifted right
 *  by 14, incremented by one, shifted right once more and truncated to 16 bits. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ( (uint16_t)( ( ( ((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1 ) >> 1 ) )
14566
14567IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14568{
14569 RTUINT64U uSrc1 = { *puDst };
14570 RTUINT64U uSrc2 = { *puSrc };
14571 RTUINT64U uDst;
14572
14573 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
14574 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
14575 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
14576 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
14577 *puDst = uDst.u;
14578}
14579
14580
14581IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14582{
14583 RTUINT128U uSrc1 = *puDst;
14584
14585 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
14586 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
14587 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
14588 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
14589 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
14590 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
14591 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
14592 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
14593}
14594
14595
14596IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14597{
14598 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14599
14600 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
14601 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
14602 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
14603 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
14604 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
14605 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
14606 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
14607 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
14608
14609 puDst->au64[0] = uDst.au64[0];
14610 puDst->au64[1] = uDst.au64[1];
14611}
14612
14613
14614IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14615{
14616 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14617
14618 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
14619 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
14620 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
14621 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
14622 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
14623 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
14624 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
14625 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
14626 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
14627 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
14628 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
14629 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
14630 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
14631 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
14632 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
14633 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
14634
14635 puDst->au64[0] = uDst.au64[0];
14636 puDst->au64[1] = uDst.au64[1];
14637 puDst->au64[2] = uDst.au64[2];
14638 puDst->au64[3] = uDst.au64[3];
14639}
14640
14641
14642/*
14643 * PSADBW / VPSADBW
14644 */
14645#ifdef IEM_WITHOUT_ASSEMBLY
14646
14647IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
14648{
14649 RTUINT64U uSrc1 = { *puDst };
14650 RTUINT64U uSrc2 = { *puSrc };
14651 RTUINT64U uDst;
14652 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14653 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14654 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14655 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14656 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14657 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14658 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14659 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14660
14661 uDst.au64[0] = 0;
14662 uDst.au16[0] = uSum;
14663 *puDst = uDst.u;
14664}
14665
14666
14667IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14668{
14669 RTUINT128U uSrc1 = *puDst;
14670
14671 puDst->au64[0] = 0;
14672 puDst->au64[1] = 0;
14673
14674 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
14675 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
14676 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
14677 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
14678 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
14679 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
14680 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
14681 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
14682 puDst->au16[0] = uSum;
14683
14684 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
14685 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
14686 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
14687 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
14688 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
14689 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
14690 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
14691 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
14692 puDst->au16[4] = uSum;
14693}
14694
14695#endif
14696
14697IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14698{
14699 RTUINT128U uSrc1 = *puSrc1;
14700 RTUINT128U uSrc2 = *puSrc2;
14701
14702 puDst->au64[0] = 0;
14703 puDst->au64[1] = 0;
14704
14705 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
14706 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14707 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14708 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14709 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14710 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14711 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14712 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14713 puDst->au16[0] = uSum;
14714
14715 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14716 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14717 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14718 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14719 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14720 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14721 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14722 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14723 puDst->au16[4] = uSum;
14724}
14725
14726IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14727{
14728 RTUINT256U uSrc1 = *puSrc1;
14729 RTUINT256U uSrc2 = *puSrc2;
14730
14731 puDst->au64[0] = 0;
14732 puDst->au64[1] = 0;
14733 puDst->au64[2] = 0;
14734 puDst->au64[3] = 0;
14735
14736 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14737 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14738 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14739 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14740 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14741 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14742 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14743 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14744 puDst->au16[0] = uSum;
14745
14746 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14747 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14748 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14749 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14750 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14751 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14752 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14753 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14754 puDst->au16[4] = uSum;
14755
14756 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
14757 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
14758 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
14759 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
14760 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
14761 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
14762 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
14763 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
14764 puDst->au16[8] = uSum;
14765
14766 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
14767 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
14768 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
14769 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
14770 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
14771 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
14772 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
14773 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
14774 puDst->au16[12] = uSum;
14775}
14776
14777
14778/*
14779 * PMULDQ / VPMULDQ
14780 */
14781IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14782{
14783 RTUINT128U uSrc1 = *puDst;
14784
14785 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
14786 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
14787}
14788
14789IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14790{
14791 RTUINT128U uSrc1 = *puSrc1;
14792 RTUINT128U uSrc2 = *puSrc2;
14793
14794 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14795 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14796}
14797
14798IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14799{
14800 RTUINT256U uSrc1 = *puSrc1;
14801 RTUINT256U uSrc2 = *puSrc2;
14802
14803 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14804 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14805 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
14806 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
14807}
14808
14809
14810/*
14811 * PMULUDQ / VPMULUDQ
14812 */
14813#ifdef IEM_WITHOUT_ASSEMBLY
14814
14815IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(uint64_t *puDst, uint64_t const *puSrc))
14816{
14817 RTUINT64U uSrc1 = { *puDst };
14818 RTUINT64U uSrc2 = { *puSrc };
14819 ASMCompilerBarrier();
14820 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14821}
14822
14823
14824IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14825{
14826 RTUINT128U uSrc1 = *puDst;
14827 RTUINT128U uSrc2 = *puSrc;
14828 ASMCompilerBarrier();
14829 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14830 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14831}
14832
14833#endif
14834
14835IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14836{
14837 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14838 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14839 ASMCompilerBarrier();
14840 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14841 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14842}
14843
14844
14845IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14846{
14847 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14848 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14849 ASMCompilerBarrier();
14850 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14851 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14852 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
14853 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
14854}
14855
14856
14857/*
14858 * UNPCKLPS / VUNPCKLPS
14859 */
14860#ifdef IEM_WITHOUT_ASSEMBLY
14861IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14862{
14863 RTUINT128U uSrc1 = *puDst;
14864 RTUINT128U uSrc2 = *puSrc;
14865 ASMCompilerBarrier();
14866 puDst->au32[0] = uSrc1.au32[0];
14867 puDst->au32[1] = uSrc2.au32[0];
14868 puDst->au32[2] = uSrc1.au32[1];
14869 puDst->au32[3] = uSrc2.au32[1];
14870}
14871
14872#endif
14873
14874IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14875{
14876 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14877 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14878 ASMCompilerBarrier();
14879 puDst->au32[0] = uSrc1.au32[0];
14880 puDst->au32[1] = uSrc2.au32[0];
14881 puDst->au32[2] = uSrc1.au32[1];
14882 puDst->au32[3] = uSrc2.au32[1];
14883}
14884
14885
14886IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14887{
14888 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14889 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14890 ASMCompilerBarrier();
14891 puDst->au32[0] = uSrc1.au32[0];
14892 puDst->au32[1] = uSrc2.au32[0];
14893 puDst->au32[2] = uSrc1.au32[1];
14894 puDst->au32[3] = uSrc2.au32[1];
14895
14896 puDst->au32[4] = uSrc1.au32[4];
14897 puDst->au32[5] = uSrc2.au32[4];
14898 puDst->au32[6] = uSrc1.au32[5];
14899 puDst->au32[7] = uSrc2.au32[5];
14900}
14901
14902
14903/*
14904 * UNPCKLPD / VUNPCKLPD
14905 */
14906#ifdef IEM_WITHOUT_ASSEMBLY
14907IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14908{
14909 RTUINT128U uSrc1 = *puDst;
14910 RTUINT128U uSrc2 = *puSrc;
14911 ASMCompilerBarrier();
14912 puDst->au64[0] = uSrc1.au64[0];
14913 puDst->au64[1] = uSrc2.au64[0];
14914}
14915
14916#endif
14917
14918IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14919{
14920 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14921 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14922 ASMCompilerBarrier();
14923 puDst->au64[0] = uSrc1.au64[0];
14924 puDst->au64[1] = uSrc2.au64[0];
14925}
14926
14927
14928IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14929{
14930 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14931 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14932 ASMCompilerBarrier();
14933 puDst->au64[0] = uSrc1.au64[0];
14934 puDst->au64[1] = uSrc2.au64[0];
14935 puDst->au64[2] = uSrc1.au64[2];
14936 puDst->au64[3] = uSrc2.au64[2];
14937}
14938
14939
14940/*
14941 * UNPCKHPS / VUNPCKHPS
14942 */
14943#ifdef IEM_WITHOUT_ASSEMBLY
14944IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14945{
14946 RTUINT128U uSrc1 = *puDst;
14947 RTUINT128U uSrc2 = *puSrc;
14948 ASMCompilerBarrier();
14949 puDst->au32[0] = uSrc1.au32[2];
14950 puDst->au32[1] = uSrc2.au32[2];
14951 puDst->au32[2] = uSrc1.au32[3];
14952 puDst->au32[3] = uSrc2.au32[3];
14953}
14954
14955#endif
14956
14957IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14958{
14959 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14960 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14961 ASMCompilerBarrier();
14962 puDst->au32[0] = uSrc1.au32[2];
14963 puDst->au32[1] = uSrc2.au32[2];
14964 puDst->au32[2] = uSrc1.au32[3];
14965 puDst->au32[3] = uSrc2.au32[3];
14966}
14967
14968
14969IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14970{
14971 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14972 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14973 ASMCompilerBarrier();
14974 puDst->au32[0] = uSrc1.au32[2];
14975 puDst->au32[1] = uSrc2.au32[2];
14976 puDst->au32[2] = uSrc1.au32[3];
14977 puDst->au32[3] = uSrc2.au32[3];
14978
14979 puDst->au32[4] = uSrc1.au32[6];
14980 puDst->au32[5] = uSrc2.au32[6];
14981 puDst->au32[6] = uSrc1.au32[7];
14982 puDst->au32[7] = uSrc2.au32[7];
14983}
14984
14985
14986/*
14987 * UNPCKHPD / VUNPCKHPD
14988 */
14989#ifdef IEM_WITHOUT_ASSEMBLY
14990IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14991{
14992 RTUINT128U uSrc1 = *puDst;
14993 RTUINT128U uSrc2 = *puSrc;
14994 ASMCompilerBarrier();
14995 puDst->au64[0] = uSrc1.au64[1];
14996 puDst->au64[1] = uSrc2.au64[1];
14997}
14998
14999#endif
15000
15001IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
15002{
15003 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
15004 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
15005 ASMCompilerBarrier();
15006 puDst->au64[0] = uSrc1.au64[1];
15007 puDst->au64[1] = uSrc2.au64[1];
15008}
15009
15010
15011IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
15012{
15013 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
15014 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
15015 ASMCompilerBarrier();
15016 puDst->au64[0] = uSrc1.au64[1];
15017 puDst->au64[1] = uSrc2.au64[1];
15018 puDst->au64[2] = uSrc1.au64[3];
15019 puDst->au64[3] = uSrc2.au64[3];
15020}
15021
15022
15023/*
 * CRC32 (SSE 4.2).
15025 */
15026
15027IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
15028{
15029 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15030}
15031
15032
15033IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
15034{
15035 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15036}
15037
15038IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
15039{
15040 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15041}
15042
15043IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
15044{
15045 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15046}
15047
15048
15049/*
15050 * PTEST (SSE 4.1) - special as it output only EFLAGS.
15051 */
15052#ifdef IEM_WITHOUT_ASSEMBLY
15053IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
15054{
15055 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15056 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
15057 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
15058 fEfl |= X86_EFL_ZF;
15059 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15060 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
15061 fEfl |= X86_EFL_CF;
15062 *pfEFlags = fEfl;
15063}
15064#endif
15065
15066IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
15067{
15068 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15069 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
15070 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
15071 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
15072 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15073 fEfl |= X86_EFL_ZF;
15074 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15075 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
15076 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
15077 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15078 fEfl |= X86_EFL_CF;
15079 *pfEFlags = fEfl;
15080}
15081
15082
15083/* Worker for VEX.128 vtestp[s|d]. */
15084static void iemAImpl_vtestp_sd_u128_worker(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint64_t fSignMask, uint32_t *pfEFlags)
15085{
15086 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15087 RTUINT128U uTemp;
15088 uTemp.au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
15089 uTemp.au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
15090 if ((( uTemp.au64[0]
15091 | uTemp.au64[1]) & fSignMask) == 0)
15092 fEfl |= X86_EFL_ZF;
15093 uTemp.au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
15094 uTemp.au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
15095 if ((( uTemp.au64[0]
15096 | uTemp.au64[1]) & fSignMask) == 0)
15097 fEfl |= X86_EFL_CF;
15098 *pfEFlags = fEfl;
15099}
15100
15101
15102/* Worker for VEX.256 vtestp[s|d]. */
15103static void iemAImpl_vtestp_sd_u256_worker(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint64_t fSignMask, uint32_t *pfEFlags)
15104{
15105 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15106 RTUINT256U uTemp;
15107 uTemp.au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
15108 uTemp.au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
15109 uTemp.au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
15110 uTemp.au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
15111 if ((( uTemp.au64[0]
15112 | uTemp.au64[1]
15113 | uTemp.au64[2]
15114 | uTemp.au64[3]) & fSignMask) == 0)
15115 fEfl |= X86_EFL_ZF;
15116 uTemp.au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
15117 uTemp.au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
15118 uTemp.au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
15119 uTemp.au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
15120 if ((( uTemp.au64[0]
15121 | uTemp.au64[1]
15122 | uTemp.au64[2]
15123 | uTemp.au64[3]) & fSignMask) == 0)
15124 fEfl |= X86_EFL_CF;
15125 *pfEFlags = fEfl;
15126}
15127
15128
15129/*
15130 * VTESTPS
15131 */
15132IEM_DECL_IMPL_DEF(void, iemAImpl_vtestps_u128_fallback,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
15133{
15134 uint64_t const fSignMask = RT_BIT_64(63) | RT_BIT_64(31);
15135 return iemAImpl_vtestp_sd_u128_worker(puSrc1, puSrc2, fSignMask, pfEFlags);
15136}
15137
15138
15139IEM_DECL_IMPL_DEF(void, iemAImpl_vtestps_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
15140{
15141 uint64_t const fSignMask = RT_BIT_64(63) | RT_BIT_64(31);
15142 return iemAImpl_vtestp_sd_u256_worker(puSrc1, puSrc2, fSignMask, pfEFlags);
15143}
15144
15145
15146/*
15147 * VTESTPD
15148 */
15149IEM_DECL_IMPL_DEF(void, iemAImpl_vtestpd_u128_fallback,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
15150{
15151 uint64_t const fSignMask = RT_BIT_64(63);
15152 return iemAImpl_vtestp_sd_u128_worker(puSrc1, puSrc2, fSignMask, pfEFlags);
15153}
15154
15155
15156IEM_DECL_IMPL_DEF(void, iemAImpl_vtestpd_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
15157{
15158 uint64_t const fSignMask = RT_BIT_64(63);
15159 return iemAImpl_vtestp_sd_u256_worker(puSrc1, puSrc2, fSignMask, pfEFlags);
15160}
15161
15162
15163/*
15164 * PMOVSXBW / VPMOVSXBW
15165 */
15166IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15167{
15168 RTUINT64U uSrc1 = { uSrc };
15169 puDst->ai16[0] = uSrc1.ai8[0];
15170 puDst->ai16[1] = uSrc1.ai8[1];
15171 puDst->ai16[2] = uSrc1.ai8[2];
15172 puDst->ai16[3] = uSrc1.ai8[3];
15173 puDst->ai16[4] = uSrc1.ai8[4];
15174 puDst->ai16[5] = uSrc1.ai8[5];
15175 puDst->ai16[6] = uSrc1.ai8[6];
15176 puDst->ai16[7] = uSrc1.ai8[7];
15177}
15178
15179
15180IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15181{
15182 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15183 puDst->ai16[ 0] = uSrc1.ai8[ 0];
15184 puDst->ai16[ 1] = uSrc1.ai8[ 1];
15185 puDst->ai16[ 2] = uSrc1.ai8[ 2];
15186 puDst->ai16[ 3] = uSrc1.ai8[ 3];
15187 puDst->ai16[ 4] = uSrc1.ai8[ 4];
15188 puDst->ai16[ 5] = uSrc1.ai8[ 5];
15189 puDst->ai16[ 6] = uSrc1.ai8[ 6];
15190 puDst->ai16[ 7] = uSrc1.ai8[ 7];
15191 puDst->ai16[ 8] = uSrc1.ai8[ 8];
15192 puDst->ai16[ 9] = uSrc1.ai8[ 9];
15193 puDst->ai16[10] = uSrc1.ai8[10];
15194 puDst->ai16[11] = uSrc1.ai8[11];
15195 puDst->ai16[12] = uSrc1.ai8[12];
15196 puDst->ai16[13] = uSrc1.ai8[13];
15197 puDst->ai16[14] = uSrc1.ai8[14];
15198 puDst->ai16[15] = uSrc1.ai8[15];
15199}
15200
15201
15202/*
15203 * PMOVSXBD / VPMOVSXBD
15204 */
15205IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15206{
15207 RTUINT32U uSrc1 = { uSrc };
15208 puDst->ai32[0] = uSrc1.ai8[0];
15209 puDst->ai32[1] = uSrc1.ai8[1];
15210 puDst->ai32[2] = uSrc1.ai8[2];
15211 puDst->ai32[3] = uSrc1.ai8[3];
15212}
15213
15214
15215IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15216{
15217 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15218 puDst->ai32[0] = uSrc1.ai8[0];
15219 puDst->ai32[1] = uSrc1.ai8[1];
15220 puDst->ai32[2] = uSrc1.ai8[2];
15221 puDst->ai32[3] = uSrc1.ai8[3];
15222 puDst->ai32[4] = uSrc1.ai8[4];
15223 puDst->ai32[5] = uSrc1.ai8[5];
15224 puDst->ai32[6] = uSrc1.ai8[6];
15225 puDst->ai32[7] = uSrc1.ai8[7];
15226}
15227
15228
15229/*
15230 * PMOVSXBQ / VPMOVSXBQ
15231 */
15232IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15233{
15234 RTUINT16U uSrc1 = { uSrc };
15235 puDst->ai64[0] = uSrc1.ai8[0];
15236 puDst->ai64[1] = uSrc1.ai8[1];
15237}
15238
15239
15240IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15241{
15242 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15243 puDst->ai64[0] = uSrc1.ai8[0];
15244 puDst->ai64[1] = uSrc1.ai8[1];
15245 puDst->ai64[2] = uSrc1.ai8[2];
15246 puDst->ai64[3] = uSrc1.ai8[3];
15247}
15248
15249
15250/*
15251 * PMOVSXWD / VPMOVSXWD
15252 */
15253IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15254{
15255 RTUINT64U uSrc1 = { uSrc };
15256 puDst->ai32[0] = uSrc1.ai16[0];
15257 puDst->ai32[1] = uSrc1.ai16[1];
15258 puDst->ai32[2] = uSrc1.ai16[2];
15259 puDst->ai32[3] = uSrc1.ai16[3];
15260}
15261
15262
15263IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15264{
15265 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15266 puDst->ai32[0] = uSrc1.ai16[0];
15267 puDst->ai32[1] = uSrc1.ai16[1];
15268 puDst->ai32[2] = uSrc1.ai16[2];
15269 puDst->ai32[3] = uSrc1.ai16[3];
15270 puDst->ai32[4] = uSrc1.ai16[4];
15271 puDst->ai32[5] = uSrc1.ai16[5];
15272 puDst->ai32[6] = uSrc1.ai16[6];
15273 puDst->ai32[7] = uSrc1.ai16[7];
15274}
15275
15276
15277/*
15278 * PMOVSXWQ / VPMOVSXWQ
15279 */
15280IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15281{
15282 RTUINT32U uSrc1 = { uSrc };
15283 puDst->ai64[0] = uSrc1.ai16[0];
15284 puDst->ai64[1] = uSrc1.ai16[1];
15285}
15286
15287
15288IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15289{
15290 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15291 puDst->ai64[0] = uSrc1.ai16[0];
15292 puDst->ai64[1] = uSrc1.ai16[1];
15293 puDst->ai64[2] = uSrc1.ai16[2];
15294 puDst->ai64[3] = uSrc1.ai16[3];
15295}
15296
15297
15298/*
15299 * PMOVSXDQ / VPMOVSXDQ
15300 */
15301IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15302{
15303 RTUINT64U uSrc1 = { uSrc };
15304 puDst->ai64[0] = uSrc1.ai32[0];
15305 puDst->ai64[1] = uSrc1.ai32[1];
15306}
15307
15308
15309IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15310{
15311 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15312 puDst->ai64[0] = uSrc1.ai32[0];
15313 puDst->ai64[1] = uSrc1.ai32[1];
15314 puDst->ai64[2] = uSrc1.ai32[2];
15315 puDst->ai64[3] = uSrc1.ai32[3];
15316}
15317
15318
15319/*
15320 * PMOVZXBW / VPMOVZXBW
15321 */
15322IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15323{
15324 RTUINT64U uSrc1 = { uSrc };
15325 puDst->au16[0] = uSrc1.au8[0];
15326 puDst->au16[1] = uSrc1.au8[1];
15327 puDst->au16[2] = uSrc1.au8[2];
15328 puDst->au16[3] = uSrc1.au8[3];
15329 puDst->au16[4] = uSrc1.au8[4];
15330 puDst->au16[5] = uSrc1.au8[5];
15331 puDst->au16[6] = uSrc1.au8[6];
15332 puDst->au16[7] = uSrc1.au8[7];
15333}
15334
15335
15336IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15337{
15338 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15339 puDst->au16[ 0] = uSrc1.au8[ 0];
15340 puDst->au16[ 1] = uSrc1.au8[ 1];
15341 puDst->au16[ 2] = uSrc1.au8[ 2];
15342 puDst->au16[ 3] = uSrc1.au8[ 3];
15343 puDst->au16[ 4] = uSrc1.au8[ 4];
15344 puDst->au16[ 5] = uSrc1.au8[ 5];
15345 puDst->au16[ 6] = uSrc1.au8[ 6];
15346 puDst->au16[ 7] = uSrc1.au8[ 7];
15347 puDst->au16[ 8] = uSrc1.au8[ 8];
15348 puDst->au16[ 9] = uSrc1.au8[ 9];
15349 puDst->au16[10] = uSrc1.au8[10];
15350 puDst->au16[11] = uSrc1.au8[11];
15351 puDst->au16[12] = uSrc1.au8[12];
15352 puDst->au16[13] = uSrc1.au8[13];
15353 puDst->au16[14] = uSrc1.au8[14];
15354 puDst->au16[15] = uSrc1.au8[15];
15355}
15356
15357
15358/*
15359 * PMOVZXBD / VPMOVZXBD
15360 */
15361IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15362{
15363 RTUINT32U uSrc1 = { uSrc };
15364 puDst->au32[0] = uSrc1.au8[0];
15365 puDst->au32[1] = uSrc1.au8[1];
15366 puDst->au32[2] = uSrc1.au8[2];
15367 puDst->au32[3] = uSrc1.au8[3];
15368}
15369
15370
15371IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15372{
15373 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15374 puDst->au32[0] = uSrc1.au8[0];
15375 puDst->au32[1] = uSrc1.au8[1];
15376 puDst->au32[2] = uSrc1.au8[2];
15377 puDst->au32[3] = uSrc1.au8[3];
15378 puDst->au32[4] = uSrc1.au8[4];
15379 puDst->au32[5] = uSrc1.au8[5];
15380 puDst->au32[6] = uSrc1.au8[6];
15381 puDst->au32[7] = uSrc1.au8[7];
15382}
15383
15384
15385/*
15386 * PMOVZXBQ / VPMOVZXBQ
15387 */
15388IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15389{
15390 RTUINT16U uSrc1 = { uSrc };
15391 puDst->au64[0] = uSrc1.au8[0];
15392 puDst->au64[1] = uSrc1.au8[1];
15393}
15394
15395
15396IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15397{
15398 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15399 puDst->au64[0] = uSrc1.au8[0];
15400 puDst->au64[1] = uSrc1.au8[1];
15401 puDst->au64[2] = uSrc1.au8[2];
15402 puDst->au64[3] = uSrc1.au8[3];
15403}
15404
15405
15406/*
15407 * PMOVZXWD / VPMOVZXWD
15408 */
15409IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15410{
15411 RTUINT64U uSrc1 = { uSrc };
15412 puDst->au32[0] = uSrc1.au16[0];
15413 puDst->au32[1] = uSrc1.au16[1];
15414 puDst->au32[2] = uSrc1.au16[2];
15415 puDst->au32[3] = uSrc1.au16[3];
15416}
15417
15418
15419IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15420{
15421 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15422 puDst->au32[0] = uSrc1.au16[0];
15423 puDst->au32[1] = uSrc1.au16[1];
15424 puDst->au32[2] = uSrc1.au16[2];
15425 puDst->au32[3] = uSrc1.au16[3];
15426 puDst->au32[4] = uSrc1.au16[4];
15427 puDst->au32[5] = uSrc1.au16[5];
15428 puDst->au32[6] = uSrc1.au16[6];
15429 puDst->au32[7] = uSrc1.au16[7];
15430}
15431
15432
15433/*
15434 * PMOVZXWQ / VPMOVZXWQ
15435 */
15436IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15437{
15438 RTUINT32U uSrc1 = { uSrc };
15439 puDst->au64[0] = uSrc1.au16[0];
15440 puDst->au64[1] = uSrc1.au16[1];
15441}
15442
15443
15444IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15445{
15446 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15447 puDst->au64[0] = uSrc1.au16[0];
15448 puDst->au64[1] = uSrc1.au16[1];
15449 puDst->au64[2] = uSrc1.au16[2];
15450 puDst->au64[3] = uSrc1.au16[3];
15451}
15452
15453
15454/*
15455 * PMOVZXDQ / VPMOVZXDQ
15456 */
15457IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15458{
15459 RTUINT64U uSrc1 = { uSrc };
15460 puDst->au64[0] = uSrc1.au32[0];
15461 puDst->au64[1] = uSrc1.au32[1];
15462}
15463
15464
15465IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15466{
15467 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15468 puDst->au64[0] = uSrc1.au32[0];
15469 puDst->au64[1] = uSrc1.au32[1];
15470 puDst->au64[2] = uSrc1.au32[2];
15471 puDst->au64[3] = uSrc1.au32[3];
15472}
15473
15474/**
15475 * Converts from the packed IPRT 32-bit (single precision) floating point format to
15476 * the SoftFloat 32-bit floating point format (float32_t).
15477 *
15478 * This is only a structure format conversion, nothing else.
15479 */
15480DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
15481{
15482 float32_t Tmp;
15483 Tmp.v = pr32Val->u;
15484 return Tmp;
15485}
15486
15487
15488/**
15489 * Converts from SoftFloat 32-bit floating point format (float32_t)
15490 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
15491 *
15492 * This is only a structure format conversion, nothing else.
15493 */
15494DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
15495{
15496 pr32Dst->u = r32XSrc.v;
15497 return pr32Dst;
15498}
15499
15500
15501/**
15502 * Converts from the packed IPRT 64-bit (single precision) floating point format to
15503 * the SoftFloat 64-bit floating point format (float64_t).
15504 *
15505 * This is only a structure format conversion, nothing else.
15506 */
15507DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
15508{
15509 float64_t Tmp;
15510 Tmp.v = pr64Val->u;
15511 return Tmp;
15512}
15513
15514
15515/**
15516 * Converts from SoftFloat 64-bit floating point format (float64_t)
15517 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
15518 *
15519 * This is only a structure format conversion, nothing else.
15520 */
15521DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
15522{
15523 pr64Dst->u = r64XSrc.v;
15524 return pr64Dst;
15525}
15526
15527
/**
 * Brace initializer for a softfloat_state_t derived from an MXCSR value.
 *
 * The initializer lists five values in order: the tininess detection mode,
 * the rounding mode translated from the MXCSR.RC field, a zero, the MXCSR
 * exception mask bits shifted down to bit 0, and the rounding precision.
 *
 * @param   a_Mxcsr     The MXCSR value; evaluated more than once, so it must
 *                      be free of side effects.
 *
 * @note    NOTE(review): the zero presumably clears the accumulated exception
 *          flags - confirm against the softfloat_state_t member order.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        softfloat_tininess_afterRounding, \
        (  ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
         : (  ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP   ? (uint8_t)softfloat_round_max \
            : (  ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN ? (uint8_t)softfloat_round_min \
               : (uint8_t)softfloat_round_minMag))), \
        0, \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), /* Matches X86_FSW_?E */ \
        32 /* Rounding precision, not relevant for SIMD. */ \
    }
15540
15541#ifdef IEM_WITHOUT_ASSEMBLY
15542
15543/**
15544 * Helper for transfering exception to MXCSR and setting the result value
15545 * accordingly.
15546 *
15547 * @returns Updated MXCSR.
15548 * @param pSoftState The SoftFloat state following the operation.
15549 * @param r32Result The result of the SoftFloat operation.
15550 * @param pr32Result Where to store the result for IEM.
15551 * @param fMxcsr The original MXCSR value.
15552 */
15553DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
15554 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15555{
15556 iemFpSoftF32ToIprt(pr32Result, r32Result);
15557
15558 uint8_t fXcpt = pSoftState->exceptionFlags;
15559 if ( (fMxcsr & X86_MXCSR_FZ)
15560 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
15561 {
15562 /* Underflow masked and flush to zero is set. */
15563 pr32Result->s.uFraction = 0;
15564 pr32Result->s.uExponent = 0;
15565 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15566 }
15567
15568 /* If DAZ is set \#DE is never set. */
15569 if ( fMxcsr & X86_MXCSR_DAZ
15570 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15571 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15572 fXcpt &= ~X86_MXCSR_DE;
15573
15574 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15575}
15576
15577
15578/**
15579 * Helper for transfering exception to MXCSR and setting the result value
15580 * accordingly - ignores Flush-to-Zero.
15581 *
15582 * @returns Updated MXCSR.
15583 * @param pSoftState The SoftFloat state following the operation.
15584 * @param r32Result The result of the SoftFloat operation.
15585 * @param pr32Result Where to store the result for IEM.
15586 * @param fMxcsr The original MXCSR value.
15587 */
15588DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
15589 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15590{
15591 iemFpSoftF32ToIprt(pr32Result, r32Result);
15592
15593 uint8_t fXcpt = pSoftState->exceptionFlags;
15594 /* If DAZ is set \#DE is never set. */
15595 if ( fMxcsr & X86_MXCSR_DAZ
15596 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15597 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15598 fXcpt &= ~X86_MXCSR_DE;
15599
15600 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15601}
15602
15603
15604/**
15605 * Helper for transfering exception to MXCSR and setting the result value
15606 * accordingly.
15607 *
15608 * @returns Updated MXCSR.
15609 * @param pSoftState The SoftFloat state following the operation.
15610 * @param r64Result The result of the SoftFloat operation.
15611 * @param pr64Result Where to store the result for IEM.
15612 * @param fMxcsr The original MXCSR value.
15613 */
15614DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
15615 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15616{
15617 iemFpSoftF64ToIprt(pr64Result, r64Result);
15618 uint8_t fXcpt = pSoftState->exceptionFlags;
15619 if ( (fMxcsr & X86_MXCSR_FZ)
15620 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
15621 {
15622 /* Underflow masked and flush to zero is set. */
15623 iemFpSoftF64ToIprt(pr64Result, r64Result);
15624 pr64Result->s.uFractionHigh = 0;
15625 pr64Result->s.uFractionLow = 0;
15626 pr64Result->s.uExponent = 0;
15627 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15628 }
15629
15630 /* If DAZ is set \#DE is never set. */
15631 if ( fMxcsr & X86_MXCSR_DAZ
15632 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15633 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15634 fXcpt &= ~X86_MXCSR_DE;
15635
15636 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15637}
15638
15639
15640/**
15641 * Helper for transfering exception to MXCSR and setting the result value
15642 * accordingly - ignores Flush-to-Zero.
15643 *
15644 * @returns Updated MXCSR.
15645 * @param pSoftState The SoftFloat state following the operation.
15646 * @param r64Result The result of the SoftFloat operation.
15647 * @param pr64Result Where to store the result for IEM.
15648 * @param fMxcsr The original MXCSR value.
15649 */
15650DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
15651 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15652{
15653 iemFpSoftF64ToIprt(pr64Result, r64Result);
15654
15655 uint8_t fXcpt = pSoftState->exceptionFlags;
15656 /* If DAZ is set \#DE is never set. */
15657 if ( fMxcsr & X86_MXCSR_DAZ
15658 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15659 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15660 fXcpt &= ~X86_MXCSR_DE;
15661
15662 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15663}
15664
15665#endif /* IEM_WITHOUT_ASSEMBLY */
15666
15667
15668/**
15669 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
15670 * in MXCSR into account.
15671 *
15672 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15673 * @param pr32Val Where to store the result.
15674 * @param fMxcsr The input MXCSR value.
15675 * @param pr32Src The value to use.
15676 */
15677DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15678{
15679 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
15680 {
15681 if (fMxcsr & X86_MXCSR_DAZ)
15682 {
15683 /* De-normals are changed to 0. */
15684 pr32Val->s.fSign = pr32Src->s.fSign;
15685 pr32Val->s.uFraction = 0;
15686 pr32Val->s.uExponent = 0;
15687 return 0;
15688 }
15689
15690 *pr32Val = *pr32Src;
15691 return X86_MXCSR_DE;
15692 }
15693
15694 *pr32Val = *pr32Src;
15695 return 0;
15696}
15697
15698
15699/**
15700 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
15701 * in MXCSR into account.
15702 *
15703 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15704 * @param pr64Val Where to store the result.
15705 * @param fMxcsr The input MXCSR value.
15706 * @param pr64Src The value to use.
15707 */
15708DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15709{
15710 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
15711 {
15712 if (fMxcsr & X86_MXCSR_DAZ)
15713 {
15714 /* De-normals are changed to 0. */
15715 pr64Val->s64.fSign = pr64Src->s.fSign;
15716 pr64Val->s64.uFraction = 0;
15717 pr64Val->s64.uExponent = 0;
15718 return 0;
15719 }
15720
15721 *pr64Val = *pr64Src;
15722 return X86_MXCSR_DE;
15723 }
15724
15725 *pr64Val = *pr64Src;
15726 return 0;
15727}
15728
15729#ifdef IEM_WITHOUT_ASSEMBLY
15730
15731/**
15732 * Validates the given input operands returning whether the operation can continue or whether one
15733 * of the source operands contains a NaN value, setting the output accordingly.
15734 *
15735 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15736 * @param pr32Res Where to store the result in case the operation can't continue.
15737 * @param pr32Val1 The first input operand.
15738 * @param pr32Val2 The second input operand.
15739 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15740 */
15741DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
15742{
15743 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
15744 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
15745 if (cSNan + cQNan == 2)
15746 {
15747 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15748 *pr32Res = *pr32Val1;
15749 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15750 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15751 return true;
15752 }
15753 if (cSNan)
15754 {
15755 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15756 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15757 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15758 *pfMxcsr |= X86_MXCSR_IE;
15759 return true;
15760 }
15761 if (cQNan)
15762 {
15763 /* The QNan operand is placed into the result. */
15764 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15765 return true;
15766 }
15767
15768 Assert(!cQNan && !cSNan);
15769 return false;
15770}
15771
15772
15773/**
15774 * Validates the given double precision input operands returning whether the operation can continue or whether one
15775 * of the source operands contains a NaN value, setting the output accordingly.
15776 *
15777 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15778 * @param pr64Res Where to store the result in case the operation can't continue.
15779 * @param pr64Val1 The first input operand.
15780 * @param pr64Val2 The second input operand.
15781 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15782 */
15783DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
15784{
15785 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
15786 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
15787 if (cSNan + cQNan == 2)
15788 {
15789 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15790 *pr64Res = *pr64Val1;
15791 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15792 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15793 return true;
15794 }
15795 if (cSNan)
15796 {
15797 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15798 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15799 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15800 *pfMxcsr |= X86_MXCSR_IE;
15801 return true;
15802 }
15803 if (cQNan)
15804 {
15805 /* The QNan operand is placed into the result. */
15806 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15807 return true;
15808 }
15809
15810 Assert(!cQNan && !cSNan);
15811 return false;
15812}
15813
15814
15815/**
15816 * Validates the given single input operand returning whether the operation can continue or whether
15817 * contains a NaN value, setting the output accordingly.
15818 *
15819 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15820 * @param pr32Res Where to store the result in case the operation can't continue.
15821 * @param pr32Val The input operand.
15822 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15823 */
15824DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
15825{
15826 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
15827 {
15828 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15829 *pr32Res = *pr32Val;
15830 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15831 *pfMxcsr |= X86_MXCSR_IE;
15832 return true;
15833 }
15834 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
15835 {
15836 /* The QNan operand is placed into the result. */
15837 *pr32Res = *pr32Val;
15838 return true;
15839 }
15840
15841 return false;
15842}
15843
15844
15845/**
15846 * Validates the given double input operand returning whether the operation can continue or whether
15847 * contains a NaN value, setting the output accordingly.
15848 *
15849 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15850 * @param pr64Res Where to store the result in case the operation can't continue.
15851 * @param pr64Val The input operand.
15852 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15853 */
15854DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
15855{
15856 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
15857 {
15858 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15859 *pr64Res = *pr64Val;
15860 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15861 *pfMxcsr |= X86_MXCSR_IE;
15862 return true;
15863 }
15864 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
15865 {
15866 /* The QNan operand is placed into the result. */
15867 *pr64Res = *pr64Val;
15868 return true;
15869 }
15870
15871 return false;
15872}
15873
15874#endif /* IEM_WITHOUT_ASSEMBLY */
15875
15876/**
15877 * ADDPS
15878 */
15879#ifdef IEM_WITHOUT_ASSEMBLY
15880static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15881{
15882 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15883 return fMxcsr;
15884
15885 RTFLOAT32U r32Src1, r32Src2;
15886 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15887 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15888 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15889 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15890 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15891}
15892
15893
15894IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15895{
15896 return iemAImpl_addps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
15897 | iemAImpl_addps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
15898 | iemAImpl_addps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
15899 | iemAImpl_addps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15900}
15901#endif
15902
15903
15904/**
15905 * ADDSS
15906 */
15907#ifdef IEM_WITHOUT_ASSEMBLY
15908IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15909{
15910 pResult->ar32[1] = puSrc1->ar32[1];
15911 pResult->ar32[2] = puSrc1->ar32[2];
15912 pResult->ar32[3] = puSrc1->ar32[3];
15913 return iemAImpl_addps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
15914}
15915#endif
15916
15917
15918/**
15919 * ADDPD
15920 */
15921#ifdef IEM_WITHOUT_ASSEMBLY
15922static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15923{
15924 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15925 return fMxcsr;
15926
15927 RTFLOAT64U r64Src1, r64Src2;
15928 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15929 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15930 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15931 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15932 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15933}
15934
15935
15936IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15937{
15938 return iemAImpl_addpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
15939 | iemAImpl_addpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15940}
15941#endif
15942
15943
15944/**
15945 * ADDSD
15946 */
15947#ifdef IEM_WITHOUT_ASSEMBLY
15948IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15949{
15950 pResult->ar64[1] = puSrc1->ar64[1];
15951 return iemAImpl_addpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
15952}
15953#endif
15954
15955
15956/**
15957 * MULPS
15958 */
15959#ifdef IEM_WITHOUT_ASSEMBLY
15960static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15961{
15962 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15963 return fMxcsr;
15964
15965 RTFLOAT32U r32Src1, r32Src2;
15966 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15967 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15968 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15969 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15970 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15971}
15972
15973
15974IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15975{
15976 return iemAImpl_mulps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
15977 | iemAImpl_mulps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
15978 | iemAImpl_mulps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
15979 | iemAImpl_mulps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15980}
15981#endif
15982
15983
15984/**
15985 * MULSS
15986 */
15987#ifdef IEM_WITHOUT_ASSEMBLY
15988IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15989{
15990 pResult->ar32[1] = puSrc1->ar32[1];
15991 pResult->ar32[2] = puSrc1->ar32[2];
15992 pResult->ar32[3] = puSrc1->ar32[3];
15993 return iemAImpl_mulps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
15994}
15995#endif
15996
15997
15998/**
15999 * MULPD
16000 */
16001#ifdef IEM_WITHOUT_ASSEMBLY
16002static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16003{
16004 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16005 return fMxcsr;
16006
16007 RTFLOAT64U r64Src1, r64Src2;
16008 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16009 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16010 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16011 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16012 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16013}
16014
16015
16016IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16017{
16018 return iemAImpl_mulpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16019 | iemAImpl_mulpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16020}
16021#endif
16022
16023
16024/**
16025 * MULSD
16026 */
16027#ifdef IEM_WITHOUT_ASSEMBLY
16028IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16029{
16030 pResult->ar64[1] = puSrc1->ar64[1];
16031 return iemAImpl_mulpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16032}
16033#endif
16034
16035
16036/**
16037 * SUBPS
16038 */
16039#ifdef IEM_WITHOUT_ASSEMBLY
16040static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16041{
16042 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16043 return fMxcsr;
16044
16045 RTFLOAT32U r32Src1, r32Src2;
16046 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16047 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16048 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16049 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16050 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16051}
16052
16053
16054IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16055{
16056 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16057 | iemAImpl_subps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16058 | iemAImpl_subps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16059 | iemAImpl_subps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16060}
16061#endif
16062
16063
16064/**
16065 * SUBSS
16066 */
16067#ifdef IEM_WITHOUT_ASSEMBLY
16068IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16069{
16070 pResult->ar32[1] = puSrc1->ar32[1];
16071 pResult->ar32[2] = puSrc1->ar32[2];
16072 pResult->ar32[3] = puSrc1->ar32[3];
16073 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16074}
16075#endif
16076
16077
16078/**
16079 * SUBPD
16080 */
16081#ifdef IEM_WITHOUT_ASSEMBLY
16082static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16083{
16084 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16085 return fMxcsr;
16086
16087 RTFLOAT64U r64Src1, r64Src2;
16088 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16089 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16090 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16091 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16092 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16093}
16094
16095
16096IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16097{
16098 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16099 | iemAImpl_subpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16100}
16101#endif
16102
16103
16104/**
16105 * SUBSD
16106 */
16107#ifdef IEM_WITHOUT_ASSEMBLY
16108IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16109{
16110 pResult->ar64[1] = puSrc1->ar64[1];
16111 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16112}
16113#endif
16114
16115
16116/**
16117 * MINPS
16118 */
16119#ifdef IEM_WITHOUT_ASSEMBLY
16120static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16121{
16122 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16123 {
16124 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16125 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16126 return fMxcsr | X86_MXCSR_IE;
16127 }
16128
16129 RTFLOAT32U r32Src1, r32Src2;
16130 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16131 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16132 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16133 {
16134 *pr32Res = r32Src2;
16135 return fMxcsr;
16136 }
16137
16138 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16139 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16140 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16141 fLe
16142 ? iemFpSoftF32FromIprt(&r32Src1)
16143 : iemFpSoftF32FromIprt(&r32Src2),
16144 pr32Res, fMxcsr);
16145}
16146
16147
16148IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16149{
16150 return iemAImpl_minps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16151 | iemAImpl_minps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16152 | iemAImpl_minps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16153 | iemAImpl_minps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16154}
16155#endif
16156
16157
16158/**
16159 * MINSS
16160 */
16161#ifdef IEM_WITHOUT_ASSEMBLY
16162IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16163{
16164 pResult->ar32[1] = puSrc1->ar32[1];
16165 pResult->ar32[2] = puSrc1->ar32[2];
16166 pResult->ar32[3] = puSrc1->ar32[3];
16167 return iemAImpl_minps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16168}
16169#endif
16170
16171
16172/**
16173 * MINPD
16174 */
16175#ifdef IEM_WITHOUT_ASSEMBLY
16176static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16177{
16178 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16179 {
16180 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16181 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16182 return fMxcsr | X86_MXCSR_IE;
16183 }
16184
16185 RTFLOAT64U r64Src1, r64Src2;
16186 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16187 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16188 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16189 {
16190 *pr64Res = r64Src2;
16191 return fMxcsr;
16192 }
16193
16194 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16195 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16196 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16197 fLe
16198 ? iemFpSoftF64FromIprt(&r64Src1)
16199 : iemFpSoftF64FromIprt(&r64Src2),
16200 pr64Res, fMxcsr);
16201}
16202
16203
16204IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16205{
16206 return iemAImpl_minpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16207 | iemAImpl_minpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16208}
16209#endif
16210
16211
16212/**
16213 * MINSD
16214 */
16215#ifdef IEM_WITHOUT_ASSEMBLY
16216IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16217{
16218 pResult->ar64[1] = puSrc1->ar64[1];
16219 return iemAImpl_minpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16220}
16221#endif
16222
16223
16224/**
16225 * DIVPS
16226 */
16227#ifdef IEM_WITHOUT_ASSEMBLY
16228static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16229{
16230 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16231 return fMxcsr;
16232
16233 RTFLOAT32U r32Src1, r32Src2;
16234 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16235 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16236 if (RTFLOAT32U_IS_ZERO(&r32Src2))
16237 {
16238 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
16239 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
16240 {
16241 *pr32Res = g_ar32QNaN[1];
16242 return fMxcsr | X86_MXCSR_IE;
16243 }
16244 else if (RTFLOAT32U_IS_INF(&r32Src1))
16245 {
16246 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16247 return fMxcsr;
16248 }
16249 else
16250 {
16251 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16252 return fMxcsr | X86_MXCSR_ZE;
16253 }
16254 }
16255
16256 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16257 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16258 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16259}
16260
16261
16262IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16263{
16264 return iemAImpl_divps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16265 | iemAImpl_divps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16266 | iemAImpl_divps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16267 | iemAImpl_divps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16268}
16269#endif
16270
16271
16272/**
16273 * DIVSS
16274 */
16275#ifdef IEM_WITHOUT_ASSEMBLY
16276IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16277{
16278 pResult->ar32[1] = puSrc1->ar32[1];
16279 pResult->ar32[2] = puSrc1->ar32[2];
16280 pResult->ar32[3] = puSrc1->ar32[3];
16281 return iemAImpl_divps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16282}
16283#endif
16284
16285
16286/**
16287 * DIVPD
16288 */
16289#ifdef IEM_WITHOUT_ASSEMBLY
16290static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16291{
16292 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16293 return fMxcsr;
16294
16295 RTFLOAT64U r64Src1, r64Src2;
16296 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16297 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16298 if (RTFLOAT64U_IS_ZERO(&r64Src2))
16299 {
16300 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
16301 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
16302 {
16303 *pr64Res = g_ar64QNaN[1];
16304 return fMxcsr | X86_MXCSR_IE;
16305 }
16306 else if (RTFLOAT64U_IS_INF(&r64Src1))
16307 {
16308 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16309 return fMxcsr;
16310 }
16311 else
16312 {
16313 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16314 return fMxcsr | X86_MXCSR_ZE;
16315 }
16316 }
16317
16318 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16319 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16320 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16321}
16322
16323
16324IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16325{
16326 return iemAImpl_divpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16327 | iemAImpl_divpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16328}
16329#endif
16330
16331
16332/**
16333 * DIVSD
16334 */
16335#ifdef IEM_WITHOUT_ASSEMBLY
16336IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16337{
16338 pResult->ar64[1] = puSrc1->ar64[1];
16339 return iemAImpl_divpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16340}
16341#endif
16342
16343
16344/**
16345 * MAXPS
16346 */
16347#ifdef IEM_WITHOUT_ASSEMBLY
16348static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16349{
16350 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16351 {
16352 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16353 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16354 return fMxcsr | X86_MXCSR_IE;
16355 }
16356
16357 RTFLOAT32U r32Src1, r32Src2;
16358 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16359 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16360 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16361 {
16362 *pr32Res = r32Src2;
16363 return fMxcsr;
16364 }
16365
16366 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16367 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16368 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16369 fLe
16370 ? iemFpSoftF32FromIprt(&r32Src2)
16371 : iemFpSoftF32FromIprt(&r32Src1),
16372 pr32Res, fMxcsr);
16373}
16374
16375
16376IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16377{
16378 return iemAImpl_maxps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16379 | iemAImpl_maxps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16380 | iemAImpl_maxps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16381 | iemAImpl_maxps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16382}
16383#endif
16384
16385
16386/**
16387 * MAXSS
16388 */
16389#ifdef IEM_WITHOUT_ASSEMBLY
16390IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16391{
16392 pResult->ar32[1] = puSrc1->ar32[1];
16393 pResult->ar32[2] = puSrc1->ar32[2];
16394 pResult->ar32[3] = puSrc1->ar32[3];
16395 return iemAImpl_maxps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16396}
16397#endif
16398
16399
16400/**
16401 * MAXPD
16402 */
16403#ifdef IEM_WITHOUT_ASSEMBLY
16404static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16405{
16406 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16407 {
16408 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16409 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16410 return fMxcsr | X86_MXCSR_IE;
16411 }
16412
16413 RTFLOAT64U r64Src1, r64Src2;
16414 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16415 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16416 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16417 {
16418 *pr64Res = r64Src2;
16419 return fMxcsr;
16420 }
16421
16422 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16423 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16424 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16425 fLe
16426 ? iemFpSoftF64FromIprt(&r64Src2)
16427 : iemFpSoftF64FromIprt(&r64Src1),
16428 pr64Res, fMxcsr);
16429}
16430
16431
16432IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16433{
16434 return iemAImpl_maxpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16435 | iemAImpl_maxpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16436}
16437#endif
16438
16439
16440/**
16441 * MAXSD
16442 */
16443#ifdef IEM_WITHOUT_ASSEMBLY
16444IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16445{
16446 pResult->ar64[1] = puSrc1->ar64[1];
16447 return iemAImpl_maxpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16448}
16449#endif
16450
16451
16452/**
16453 * CVTSS2SD
16454 */
16455#ifdef IEM_WITHOUT_ASSEMBLY
16456static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16457{
16458 RTFLOAT32U r32Src1;
16459 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16460
16461 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16462 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16463 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16464}
16465
16466
16467IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtss2sd_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16468{
16469 pResult->ar64[1] = puSrc1->ar64[1];
16470 return iemAImpl_cvtss2sd_u128_r32_worker(&pResult->ar64[0], uMxCsrIn, pr32Src2);
16471}
16472#endif
16473
16474
16475/**
16476 * CVTSD2SS
16477 */
16478#ifdef IEM_WITHOUT_ASSEMBLY
16479static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16480{
16481 RTFLOAT64U r64Src1;
16482 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16483
16484 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16485 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16486 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16487}
16488
16489
16490IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsd2ss_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16491{
16492 pResult->ar32[1] = puSrc1->ar32[1];
16493 pResult->ar32[2] = puSrc1->ar32[2];
16494 pResult->ar32[3] = puSrc1->ar32[3];
16495 return iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->ar32[0], uMxCsrIn, pr64Src2);
16496}
16497#endif
16498
16499
16500/**
16501 * HADDPS
16502 */
16503#ifdef IEM_WITHOUT_ASSEMBLY
16504IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_haddps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16505{
16506 return iemAImpl_addps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc1->ar32[1])
16507 | iemAImpl_addps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[2], &puSrc1->ar32[3])
16508 | iemAImpl_addps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[0], &puSrc2->ar32[1])
16509 | iemAImpl_addps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16510}
16511#endif
16512
16513
16514/**
16515 * HADDPD
16516 */
16517#ifdef IEM_WITHOUT_ASSEMBLY
16518IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_haddpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16519{
16520 return iemAImpl_addpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc1->ar64[1])
16521 | iemAImpl_addpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16522}
16523#endif
16524
16525
16526/**
16527 * HSUBPS
16528 */
16529#ifdef IEM_WITHOUT_ASSEMBLY
16530IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_hsubps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16531{
16532 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc1->ar32[1])
16533 | iemAImpl_subps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[2], &puSrc1->ar32[3])
16534 | iemAImpl_subps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[0], &puSrc2->ar32[1])
16535 | iemAImpl_subps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16536}
16537#endif
16538
16539
16540/**
16541 * HSUBPD
16542 */
16543#ifdef IEM_WITHOUT_ASSEMBLY
16544IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_hsubpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16545{
16546 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc1->ar64[1])
16547 | iemAImpl_subpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16548}
16549#endif
16550
16551
16552/**
16553 * SQRTPS
16554 */
16555#ifdef IEM_WITHOUT_ASSEMBLY
16556static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16557{
16558 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16559 return fMxcsr;
16560
16561 RTFLOAT32U r32Src;
16562 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
16563 if (RTFLOAT32U_IS_ZERO(&r32Src))
16564 {
16565 *pr32Res = r32Src;
16566 return fMxcsr;
16567 }
16568 else if (r32Src.s.fSign)
16569 {
16570 *pr32Res = g_ar32QNaN[1];
16571 return fMxcsr | X86_MXCSR_IE;
16572 }
16573
16574 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16575 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16576 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16577}
16578
16579
16580IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16581{
16582 RT_NOREF(puSrc1);
16583
16584 return iemAImpl_sqrtps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar32[0])
16585 | iemAImpl_sqrtps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar32[1])
16586 | iemAImpl_sqrtps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[2])
16587 | iemAImpl_sqrtps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[3]);
16588}
16589#endif
16590
16591
16592/**
16593 * SQRTSS
16594 */
16595#ifdef IEM_WITHOUT_ASSEMBLY
16596IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16597{
16598 pResult->ar32[1] = puSrc1->ar32[1];
16599 pResult->ar32[2] = puSrc1->ar32[2];
16600 pResult->ar32[3] = puSrc1->ar32[3];
16601 return iemAImpl_sqrtps_u128_worker(&pResult->ar32[0], uMxCsrIn, pr32Src2);
16602}
16603#endif
16604
16605
16606/**
16607 * SQRTPD
16608 */
16609#ifdef IEM_WITHOUT_ASSEMBLY
16610static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
16611{
16612 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
16613 return fMxcsr;
16614
16615 RTFLOAT64U r64Src;
16616 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
16617 if (RTFLOAT64U_IS_ZERO(&r64Src))
16618 {
16619 *pr64Res = r64Src;
16620 return fMxcsr;
16621 }
16622 else if (r64Src.s.fSign)
16623 {
16624 *pr64Res = g_ar64QNaN[1];
16625 return fMxcsr | X86_MXCSR_IE;
16626 }
16627
16628 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16629 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
16630 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16631}
16632
16633
16634IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16635{
16636 RT_NOREF(puSrc1);
16637
16638 return iemAImpl_sqrtpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc2->ar64[0])
16639 | iemAImpl_sqrtpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar64[1]);
16640}
16641#endif
16642
16643
16644/**
16645 * SQRTSD
16646 */
16647#ifdef IEM_WITHOUT_ASSEMBLY
16648IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16649{
16650 pResult->ar64[1] = puSrc1->ar64[1];
16651 return iemAImpl_sqrtpd_u128_worker(&pResult->ar64[0], uMxCsrIn, pr64Src2);
16652}
16653#endif
16654
16655
16656#ifdef IEM_WITHOUT_ASSEMBLY
16657/**
16658 * RSQRTPS
16659 */
16660static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16661{
16662 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16663 return fMxcsr;
16664
16665 RTFLOAT32U r32Src;
16666 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16667 if (RTFLOAT32U_IS_ZERO(&r32Src))
16668 {
16669 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16670 return fMxcsr;
16671 }
16672 else if (r32Src.s.fSign)
16673 {
16674 *pr32Res = g_ar32QNaN[1];
16675 return fMxcsr | X86_MXCSR_IE;
16676 }
16677
16678 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16679 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16680 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16681}
16682
16683
16684IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rsqrtps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16685{
16686 RT_NOREF(puSrc1);
16687
16688 return iemAImpl_rsqrt_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar32[0])
16689 | iemAImpl_rsqrt_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar32[1])
16690 | iemAImpl_rsqrt_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[2])
16691 | iemAImpl_rsqrt_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[3]);
16692}
16693
16694
16695/**
16696 * RSQRTSS
16697 */
16698IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rsqrtss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16699{
16700 pResult->ar32[1] = puSrc1->ar32[1];
16701 pResult->ar32[2] = puSrc1->ar32[2];
16702 pResult->ar32[3] = puSrc1->ar32[3];
16703 return iemAImpl_rsqrt_worker(&pResult->ar32[0], uMxCsrIn, pr32Src2);
16704}
16705#endif
16706
16707
16708/**
16709 * RCPPS
16710 */
16711#ifdef IEM_WITHOUT_ASSEMBLY
16712static uint32_t iemAImpl_rcp_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16713{
16714 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16715 return fMxcsr;
16716
16717 RTFLOAT32U r32Src;
16718 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16719 if (RTFLOAT32U_IS_ZERO(&r32Src))
16720 {
16721 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16722 return fMxcsr;
16723 }
16724
16725 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16726 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&g_ar32One[0]), iemFpSoftF32FromIprt(&r32Src), &SoftState);
16727 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16728}
16729
16730
16731IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rcpps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16732{
16733 RT_NOREF(puSrc1);
16734
16735 return iemAImpl_rcp_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar32[0])
16736 | iemAImpl_rcp_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar32[1])
16737 | iemAImpl_rcp_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[2])
16738 | iemAImpl_rcp_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[3]);
16739}
16740
16741
16742/**
16743 * RCPSS
16744 */
16745IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rcpss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16746{
16747 pResult->ar32[1] = puSrc1->ar32[1];
16748 pResult->ar32[2] = puSrc1->ar32[2];
16749 pResult->ar32[3] = puSrc1->ar32[3];
16750 return iemAImpl_rcp_worker(&pResult->ar32[0], uMxCsrIn, pr32Src2);
16751}
16752#endif
16753
16754
16755/**
16756 * ADDSUBPS
16757 */
16758#ifdef IEM_WITHOUT_ASSEMBLY
16759IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addsubps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16760{
16761 RT_NOREF(puSrc1);
16762
16763 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16764 | iemAImpl_addps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16765 | iemAImpl_subps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16766 | iemAImpl_addps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16767}
16768#endif
16769
16770
16771/**
16772 * ADDSUBPD
16773 */
16774#ifdef IEM_WITHOUT_ASSEMBLY
16775IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addsubpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16776{
16777 RT_NOREF(puSrc1);
16778
16779 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16780 | iemAImpl_addpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16781}
16782#endif
16783
16784
16785/**
16786 * CVTPD2PS
16787 */
16788#ifdef IEM_WITHOUT_ASSEMBLY
16789static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16790{
16791 RTFLOAT64U r64Src1;
16792 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16793
16794 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16795 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16796 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16797}
16798
16799
16800IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpd2ps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16801{
16802 RT_NOREF(puSrc1);
16803
16804 pResult->au32[2] = 0;
16805 pResult->au32[3] = 0;
16806 return iemAImpl_cvtpd2ps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar64[0])
16807 | iemAImpl_cvtpd2ps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar64[1]);
16808}
16809#endif
16810
16811
16812/**
16813 * CVTPS2PD
16814 */
16815#ifdef IEM_WITHOUT_ASSEMBLY
16816static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16817{
16818 RTFLOAT32U r32Src1;
16819 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16820
16821 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16822 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16823 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16824}
16825
16826
16827IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtps2pd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16828{
16829 RT_NOREF(puSrc1);
16830
16831 return iemAImpl_cvtps2pd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc2->ar32[0])
16832 | iemAImpl_cvtps2pd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar32[1]);
16833}
16834#endif
16835
16836
16837/**
16838 * CVTDQ2PS
16839 */
16840#ifdef IEM_WITHOUT_ASSEMBLY
16841static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
16842{
16843 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16844 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
16845 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16846}
16847
16848
16849IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtdq2ps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16850{
16851 RT_NOREF(puSrc1);
16852
16853 return iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[0], uMxCsrIn, puSrc2->ai32[0])
16854 | iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[1], uMxCsrIn, puSrc2->ai32[1])
16855 | iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[2], uMxCsrIn, puSrc2->ai32[2])
16856 | iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[3], uMxCsrIn, puSrc2->ai32[3]);
16857}
16858#endif
16859
16860
16861/**
16862 * CVTPS2DQ
16863 */
16864#ifdef IEM_WITHOUT_ASSEMBLY
16865static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16866{
16867 RTFLOAT32U r32Src;
16868 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16869
16870 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16871 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16872 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16873}
16874
16875
16876IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtps2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16877{
16878 RT_NOREF(puSrc1);
16879
16880 return iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar32[0])
16881 | iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar32[1])
16882 | iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[2], uMxCsrIn, &puSrc2->ar32[2])
16883 | iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[3], uMxCsrIn, &puSrc2->ar32[3]);
16884}
16885#endif
16886
16887
16888/**
16889 * CVTTPS2DQ
16890 */
16891#ifdef IEM_WITHOUT_ASSEMBLY
16892static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16893{
16894 RTFLOAT32U r32Src;
16895 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16896
16897 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16898 SoftState.roundingMode = softfloat_round_minMag;
16899 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16900 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16901}
16902
16903
16904IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttps2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16905{
16906 RT_NOREF(puSrc1);
16907
16908 return iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar32[0])
16909 | iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar32[1])
16910 | iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[2], uMxCsrIn, &puSrc2->ar32[2])
16911 | iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[3], uMxCsrIn, &puSrc2->ar32[3]);
16912}
16913#endif
16914
16915
16916/**
16917 * CVTTPD2DQ
16918 */
16919#ifdef IEM_WITHOUT_ASSEMBLY
16920static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16921{
16922 RTFLOAT64U r64Src;
16923 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16924
16925 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16926 SoftState.roundingMode = softfloat_round_minMag;
16927 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16928 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16929}
16930
16931
16932IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttpd2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16933{
16934 RT_NOREF(puSrc1);
16935
16936 pResult->au64[1] = 0;
16937 return iemAImpl_cvttpd2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar64[0])
16938 | iemAImpl_cvttpd2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar64[1]);
16939}
16940#endif
16941
16942
16943/**
16944 * CVTDQ2PD
16945 */
16946#ifdef IEM_WITHOUT_ASSEMBLY
16947static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
16948{
16949 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16950 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
16951 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16952}
16953
16954
16955IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtdq2pd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16956{
16957 RT_NOREF(puSrc1);
16958
16959 return iemAImpl_cvtdq2pd_u128_worker(&pResult->ar64[0], uMxCsrIn, puSrc2->ai32[0])
16960 | iemAImpl_cvtdq2pd_u128_worker(&pResult->ar64[1], uMxCsrIn, puSrc2->ai32[1]);
16961}
16962#endif
16963
16964
16965/**
16966 * CVTPD2DQ
16967 */
16968#ifdef IEM_WITHOUT_ASSEMBLY
16969static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16970{
16971 RTFLOAT64U r64Src;
16972 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16973
16974 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16975 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16976 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16977}
16978
16979
16980IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpd2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16981{
16982 RT_NOREF(puSrc1);
16983
16984 pResult->au64[1] = 0;
16985 return iemAImpl_cvtpd2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar64[0])
16986 | iemAImpl_cvtpd2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar64[1]);
16987}
16988#endif
16989
16990
16991/**
16992 * [V]SHUFPS
16993 */
16994#ifdef IEM_WITHOUT_ASSEMBLY
16995IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16996{
16997 RTUINT128U const uSrc1 = *puDst;
16998 RTUINT128U const uSrc2 = *puSrc;
16999 ASMCompilerBarrier();
17000 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
17001 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
17002 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
17003 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
17004}
17005#endif
17006
17007
17008IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17009{
17010 RTUINT128U const uSrc1 = *puSrc1;
17011 RTUINT128U const uSrc2 = *puSrc2;
17012 ASMCompilerBarrier();
17013 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
17014 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
17015 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
17016 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
17017}
17018
17019
17020IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17021{
17022 RTUINT256U const uSrc1 = *puSrc1;
17023 RTUINT256U const uSrc2 = *puSrc2;
17024 ASMCompilerBarrier();
17025 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
17026 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
17027 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
17028 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
17029
17030 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
17031 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
17032 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
17033 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
17034}
17035
17036
17037/**
17038 * [V]SHUFPD
17039 */
17040#ifdef IEM_WITHOUT_ASSEMBLY
17041IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17042{
17043 RTUINT128U const uSrc1 = *puDst;
17044 RTUINT128U const uSrc2 = *puSrc;
17045 ASMCompilerBarrier();
17046 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
17047 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
17048}
17049#endif
17050
17051
17052IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17053{
17054 RTUINT128U const uSrc1 = *puSrc1;
17055 RTUINT128U const uSrc2 = *puSrc2;
17056 ASMCompilerBarrier();
17057 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
17058 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
17059}
17060
17061
17062IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17063{
17064 RTUINT256U const uSrc1 = *puSrc1;
17065 RTUINT256U const uSrc2 = *puSrc2;
17066 ASMCompilerBarrier();
17067 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
17068 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
17069 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
17070 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
17071}
17072
17073
17074/*
17075 * PHMINPOSUW / VPHMINPOSUW
17076 */
17077IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17078{
17079 uint16_t u16Min = puSrc->au16[0];
17080 uint8_t idxMin = 0;
17081
17082 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
17083 if (puSrc->au16[i] < u16Min)
17084 {
17085 u16Min = puSrc->au16[i];
17086 idxMin = i;
17087 }
17088
17089 puDst->au64[0] = 0;
17090 puDst->au64[1] = 0;
17091 puDst->au16[0] = u16Min;
17092 puDst->au16[1] = idxMin;
17093}
17094
17095
17096IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17097{
17098 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
17099}
17100
17101
17102/**
17103 * VPERMILPS
17104 */
17105#ifdef IEM_WITHOUT_ASSEMBLY
17106IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17107{
17108 RTUINT128U const uSrc = *puSrc;
17109 ASMCompilerBarrier();
17110
17111 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17112 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17113 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17114 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17115}
17116
17117
17118IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17119{
17120 RTUINT256U const uSrc = *puSrc;
17121 ASMCompilerBarrier();
17122
17123 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17124 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17125 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17126 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17127
17128 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17129 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17130 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17131 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17132}
17133
17134IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17135{
17136 RTUINT128U const uSrc1 = *puSrc1;
17137 RTUINT128U const uSrc2 = *puSrc2;
17138 ASMCompilerBarrier();
17139
17140 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17141 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17142 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17143 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17144}
17145
17146IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17147{
17148 RTUINT256U const uSrc1 = *puSrc1;
17149 RTUINT256U const uSrc2 = *puSrc2;
17150 ASMCompilerBarrier();
17151
17152 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17153 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17154 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17155 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17156
17157 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17158 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17159 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17160 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17161}
17162#endif
17163
17164
17165IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17166{
17167 RTUINT128U const uSrc = *puSrc;
17168 ASMCompilerBarrier();
17169
17170 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17171 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17172 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17173 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17174}
17175
17176
17177IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17178{
17179 RTUINT256U const uSrc = *puSrc;
17180 ASMCompilerBarrier();
17181
17182 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17183 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17184 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17185 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17186
17187 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17188 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17189 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17190 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17191}
17192
17193IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17194{
17195 RTUINT128U const uSrc1 = *puSrc1;
17196 RTUINT128U const uSrc2 = *puSrc2;
17197 ASMCompilerBarrier();
17198
17199 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17200 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17201 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17202 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17203}
17204
17205IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17206{
17207 RTUINT256U const uSrc1 = *puSrc1;
17208 RTUINT256U const uSrc2 = *puSrc2;
17209 ASMCompilerBarrier();
17210
17211 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17212 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17213 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17214 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17215
17216 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17217 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17218 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17219 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17220}
17221
17222
17223/**
17224 * VPERMILPD
17225 */
17226#ifdef IEM_WITHOUT_ASSEMBLY
17227IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17228{
17229 RTUINT128U const uSrc = *puSrc;
17230 ASMCompilerBarrier();
17231
17232 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17233 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17234}
17235
17236
17237IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17238{
17239 RTUINT256U const uSrc = *puSrc;
17240 ASMCompilerBarrier();
17241
17242 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17243 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17244
17245 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17246 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17247}
17248
17249IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17250{
17251 RTUINT128U const uSrc1 = *puSrc1;
17252 RTUINT128U const uSrc2 = *puSrc2;
17253 ASMCompilerBarrier();
17254
17255 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17256 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17257}
17258
17259IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17260{
17261 RTUINT256U const uSrc1 = *puSrc1;
17262 RTUINT256U const uSrc2 = *puSrc2;
17263 ASMCompilerBarrier();
17264
17265 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17266 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17267
17268 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17269 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17270}
17271#endif
17272
17273
17274IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17275{
17276 RTUINT128U const uSrc = *puSrc;
17277 ASMCompilerBarrier();
17278
17279 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17280 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17281}
17282
17283
17284IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17285{
17286 RTUINT256U const uSrc = *puSrc;
17287 ASMCompilerBarrier();
17288
17289 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17290 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17291
17292 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17293 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17294}
17295
17296IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17297{
17298 RTUINT128U const uSrc1 = *puSrc1;
17299 RTUINT128U const uSrc2 = *puSrc2;
17300 ASMCompilerBarrier();
17301
17302 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17303 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17304}
17305
17306IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17307{
17308 RTUINT256U const uSrc1 = *puSrc1;
17309 RTUINT256U const uSrc2 = *puSrc2;
17310 ASMCompilerBarrier();
17311
17312 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17313 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17314
17315 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17316 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17317}
17318
17319
17320/*
17321 * [V]PBLENDVB
17322 */
17323IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17324{
17325 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17326 if (puMask->au8[i] & RT_BIT(7))
17327 puDst->au8[i] = puSrc->au8[i];
17328}
17329
17330
17331IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17332{
17333 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17334 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17335}
17336
17337
17338IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17339{
17340 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17341 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17342}
17343
17344
17345/*
17346 * [V]BLENDVPS
17347 */
17348IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17349{
17350 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17351 if (puMask->au32[i] & RT_BIT_32(31))
17352 puDst->au32[i] = puSrc->au32[i];
17353}
17354
17355
17356IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17357{
17358 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17359 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17360}
17361
17362
17363IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17364{
17365 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17366 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17367}
17368
17369
17370/*
17371 * [V]BLENDVPD
17372 */
17373IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17374{
17375 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
17376 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
17377}
17378
17379
17380IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17381{
17382 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17383 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17384}
17385
17386
17387IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17388{
17389 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17390 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17391}
17392
17393
17394/**
17395 * [V]PALIGNR
17396 */
17397IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
17398{
17399 uint64_t const u64Src1 = *pu64Dst;
17400 ASMCompilerBarrier();
17401
17402 if (bEvil >= 16)
17403 *pu64Dst = 0;
17404 else if (bEvil >= 8)
17405 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
17406 else
17407 {
17408 uint8_t cShift = bEvil * 8;
17409 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
17410 | (u64Src2 >> cShift);
17411 }
17412}
17413
17414
17415IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17416{
17417 RTUINT128U const uSrc1 = *puDst;
17418 RTUINT128U const uSrc2 = *puSrc;
17419 ASMCompilerBarrier();
17420
17421 puDst->au64[0] = 0;
17422 puDst->au64[1] = 0;
17423 if (bEvil >= 32)
17424 { /* Everything stays 0. */ }
17425 else if (bEvil >= 16)
17426 {
17427 bEvil -= 16;
17428 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17429 puDst->au8[i - bEvil] = uSrc1.au8[i];
17430 }
17431 else
17432 {
17433 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17434 puDst->au8[i] = uSrc2.au8[i + bEvil];
17435 for (uint8_t i = 0; i < bEvil; i++)
17436 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17437 }
17438}
17439
17440
17441IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17442{
17443 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17444 RTUINT128U const uSrc2 = *puSrc2;
17445 ASMCompilerBarrier();
17446
17447 puDst->au64[0] = 0;
17448 puDst->au64[1] = 0;
17449 if (bEvil >= 32)
17450 { /* Everything stays 0. */ }
17451 else if (bEvil >= 16)
17452 {
17453 bEvil -= 16;
17454 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17455 puDst->au8[i - bEvil] = uSrc1.au8[i];
17456 }
17457 else
17458 {
17459 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17460 puDst->au8[i] = uSrc2.au8[i + bEvil];
17461 for (uint8_t i = 0; i < bEvil; i++)
17462 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17463 }
17464}
17465
17466
17467IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17468{
17469 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17470 RTUINT256U const uSrc2 = *puSrc2;
17471 ASMCompilerBarrier();
17472
17473 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
17474 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
17475}
17476
17477
17478/**
17479 * [V]PBLENDW
17480 */
17481IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17482{
17483 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17484 if (bEvil & RT_BIT(i))
17485 puDst->au16[i] = puSrc->au16[i];
17486}
17487
17488
17489IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17490{
17491 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17492 if (bEvil & RT_BIT(i))
17493 puDst->au16[i] = puSrc2->au16[i];
17494 else
17495 puDst->au16[i] = puSrc1->au16[i];
17496}
17497
17498
17499IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17500{
17501 for (uint8_t i = 0; i < 8; i++)
17502 if (bEvil & RT_BIT(i))
17503 {
17504 puDst->au16[ i] = puSrc2->au16[ i];
17505 puDst->au16[8 + i] = puSrc2->au16[8 + i];
17506 }
17507 else
17508 {
17509 puDst->au16[ i] = puSrc1->au16[ i];
17510 puDst->au16[8 + i] = puSrc1->au16[8 + i];
17511 }
17512}
17513
17514
17515/**
17516 * [V]PBLENDD
17517 */
17518IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17519{
17520 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17521 if (bEvil & RT_BIT(i))
17522 puDst->au32[i] = puSrc2->au32[i];
17523 else
17524 puDst->au32[i] = puSrc1->au32[i];
17525}
17526
17527
17528IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17529{
17530 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17531 if (bEvil & RT_BIT(i))
17532 puDst->au32[i] = puSrc2->au32[i];
17533 else
17534 puDst->au32[i] = puSrc1->au32[i];
17535}
17536
17537
17538/**
17539 * [V]BLENDPS
17540 */
17541IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17542{
17543 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17544 if (bEvil & RT_BIT(i))
17545 puDst->au32[i] = puSrc->au32[i];
17546}
17547
17548
17549IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17550{
17551 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17552 if (bEvil & RT_BIT(i))
17553 puDst->au32[i] = puSrc2->au32[i];
17554 else
17555 puDst->au32[i] = puSrc1->au32[i];
17556}
17557
17558
17559IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17560{
17561 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17562 if (bEvil & RT_BIT(i))
17563 puDst->au32[i] = puSrc2->au32[i];
17564 else
17565 puDst->au32[i] = puSrc1->au32[i];
17566}
17567
17568
17569/**
17570 * [V]BLENDPD
17571 */
17572IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17573{
17574 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17575 if (bEvil & RT_BIT(i))
17576 puDst->au64[i] = puSrc->au64[i];
17577}
17578
17579
17580IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17581{
17582 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17583 if (bEvil & RT_BIT(i))
17584 puDst->au64[i] = puSrc2->au64[i];
17585 else
17586 puDst->au64[i] = puSrc1->au64[i];
17587}
17588
17589
17590IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17591{
17592 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17593 if (bEvil & RT_BIT(i))
17594 puDst->au64[i] = puSrc2->au64[i];
17595 else
17596 puDst->au64[i] = puSrc1->au64[i];
17597}
17598
17599
17600/**
17601 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
17602 */
17603
/* The S-Box lookup table: 256 entries, indexed by the byte value to substitute.
   Used for the encryption direction (AESENC, AESENCLAST) and by
   iemAImpl_aes_sub_word. */
static uint8_t iemAImpl_aes_sbox[] = {
    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
17622
/* The InvS-Box lookup table: 256 entries, indexed by the byte value to
   substitute.  Used for the decryption direction (AESDEC, AESDECLAST). */
static uint8_t iemAImpl_aes_inv_sbox[] = {
    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
17642
/* The ShiftRows lookup table.  Entry N is the index of the state byte that
   ends up in byte N of the result (see iemAImpl_aes_shift_rows). */
static uint8_t iemAImpl_aes_shift_rows_tbl[] = {
    0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
};
17647
/* The InvShiftRows lookup table.  Entry N is the index of the state byte that
   ends up in byte N of the result (see iemAImpl_aes_shift_rows). */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[] = {
    0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3
};
17652
17653static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
17654{
17655 RTUINT128U uVal;
17656 int i;
17657
17658 for (i = 0; i < 16; ++i)
17659 uVal.au8[i] = abSubst[puSrc->au8[i]];
17660
17661 return uVal;
17662}
17663
/**
 * Doubles a byte: shifts it left by one and, when the top bit was set, XORs
 * the result with 0x1b (27).
 *
 * @returns The doubled value, truncated to eight bits.
 * @param   u   The byte to double.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    uint8_t const bDoubled = (uint8_t)(u << 1);
    if (u & 0x80)
        return (uint8_t)(bDoubled ^ 0x1b);
    return bDoubled;
}
17668
17669static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
17670{
17671 RTUINT128U uVal;
17672 int i;
17673 uint8_t tmp;
17674
17675 for (i = 0; i < 16; i += 4) {
17676 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
17677 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
17678 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
17679 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
17680 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
17681 }
17682
17683 return uVal;
17684}
17685
17686static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
17687{
17688 RTUINT128U uVal;
17689 int i;
17690
17691 for (i = 0; i < 16; ++i)
17692 uVal.au8[i] = puSrc->au8[abShift[i]];
17693
17694 return uVal;
17695}
17696
17697static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
17698{
17699 uint8_t val;
17700
17701 val = ((b >> 0) & 1) * a;
17702 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
17703 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
17704 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
17705 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
17706
17707 return val;
17708}
17709
17710static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
17711{
17712 RTUINT128U uVal;
17713 int i;
17714
17715 for (i = 0; i < 16; i += 4) {
17716 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
17717 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
17718 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
17719 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
17720 }
17721
17722 return uVal;
17723}
17724
17725static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
17726{
17727 RTUINT32U uTmp;
17728
17729 uTmp.au32[0] = w;
17730 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
17731 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
17732 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
17733 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
17734
17735 return uTmp.au32[0];
17736}
17737
/**
 * Rotates a 32-bit word right by eight bits.
 *
 * @returns The rotated word.
 * @param   w   The word to rotate.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uLowByteOnTop = w << 24;     /* The low byte wraps around to the top. */
    uint32_t const uUpperBytes   = w >> 8;      /* The other three bytes move down. */
    return uUpperBytes | uLowByteOnTop;
}
17742
17743/**
17744 * [V]AESKEYGENASSIST
17745 */
17746IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
17747{
17748 RTUINT128U uTmp;
17749 uint32_t uRCon = bImm; /* Round constant. */
17750
17751 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
17752 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
17753 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
17754 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
17755
17756 *puDst = uTmp;
17757}
17758
17759
17760/**
17761 * [V]AESIMC
17762 */
17763IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17764{
17765 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
17766}
17767
17768
17769/**
17770 * [V]AESENC
17771 */
17772IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17773{
17774 RTUINT128U uTmp;
17775
17776 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17777 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17778 uTmp = iemAImpl_aes_mix_col(&uTmp);
17779 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17780 uTmp.au64[1] ^= puSrc->au64[1];
17781
17782 *puDst = uTmp;
17783}
17784
17785
17786/**
17787 * [V]AESENCLAST
17788 */
17789IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17790{
17791 RTUINT128U uTmp;
17792
17793 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17794 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17795 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17796 uTmp.au64[1] ^= puSrc->au64[1];
17797
17798 *puDst = uTmp;
17799}
17800
17801
17802/**
17803 * [V]AESDEC
17804 */
17805IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17806{
17807 RTUINT128U uTmp;
17808
17809 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17810 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17811 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
17812 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17813 uTmp.au64[1] ^= puSrc->au64[1];
17814
17815 *puDst = uTmp;
17816}
17817
17818
17819/**
17820 * [V]AESDECLAST
17821 */
17822IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17823{
17824 RTUINT128U uTmp;
17825
17826 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17827 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17828 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17829 uTmp.au64[1] ^= puSrc->au64[1];
17830
17831 *puDst = uTmp;
17832}
17833
17834
17835/**
17836 * [V]PCMPISTRI
17837 */
17838
17839/**
17840 * Does the comparisons based on the mode and source input format.
17841 */
/**
 * Fills the comparison matrix for the PCMPxSTRx family.
 *
 * Entry [idxSrc2][idxSrc1] receives the outcome of comparing element idxSrc1
 * of the first source with element idxSrc2 of the second source.  Only the
 * part of the matrix matching the element count of the selected format is
 * written (16x16 for bytes, 8x8 for words).
 *
 * @param   afCmpRes    The comparison matrix to fill.
 * @param   puSrc1      The first source operand.
 * @param   puSrc2      The second source operand.
 * @param   bImm        The control byte: bits 1:0 select the element format
 *                      (0 = unsigned bytes, 1 = unsigned words, 2 = signed
 *                      bytes, 3 = signed words), bits 3:2 the aggregation
 *                      operation, which decides the kind of comparison done.
 */
static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
{
/*
 * Compares every element of source 2 against source 1, taking the source 1
 * elements two at a time:
 *   - Aggregation 0, 2 and 3: plain equality for both elements of the pair.
 *   - Aggregation 1 (ranges): the even element is tested with '<=' and the
 *     odd element with '>=' against the source 2 element, i.e. the pair is
 *     treated as the lower and upper bound of a range.
 * Note: the a_fCmpRes argument is not used by the expansion, which writes to
 * the enclosing function's afCmpRes directly.
 */
#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
    do \
    { \
        for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
            for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
            { \
                switch (a_bAggOp) \
                { \
                    case 0: \
                    case 2: \
                    case 3: \
                        afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
                        afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
                        break; \
                    case 1: \
                        afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
                        afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
                        break; \
                    default: \
                        AssertReleaseFailed(); \
                } \
            } \
    } while(0)

    /* The aggregation operation is the same for all formats; the format picks the union member to compare. */
    uint8_t bAggOp = (bImm >> 2) & 0x3;
    switch (bImm & 0x3)
    {
        case 0: /* Unsigned bytes. */
            PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
            break;
        case 1: /* Unsigned words. */
            PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
            break;
        case 2: /* Signed bytes. */
            PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
            break;
        case 3: /* Signed words. */
            PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
            break;
        default:
            AssertReleaseFailed();
    }
#undef PCMPXSTRX_CMP_CASE
}
17888
17889static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
17890{
17891 if (bImm & 0x1)
17892 {
17893 /* Words -> 8 elements. */
17894 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
17895 if (puSrc->au16[i] == 0)
17896 return i;
17897
17898 return 8;
17899 }
17900 else
17901 {
17902 /* Bytes -> 16 elements. */
17903 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
17904 if (puSrc->au8[i] == 0)
17905 return i;
17906
17907 return 16;
17908 }
17909}
17910
17911static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
17912{
17913 if (bImm & 0x1)
17914 {
17915 if (i64Len > -8 && i64Len < 8)
17916 return RT_ABS(i64Len);
17917
17918 return 8;
17919 }
17920 else
17921 {
17922 if (i64Len > -16 && i64Len < 16)
17923 return RT_ABS(i64Len);
17924
17925 return 16;
17926 }
17927}
17928
/**
 * Valid/Invalid override of comparisons (Table 4-7 from 4.1.6 of SDM).
 *
 * The first index is the aggregation operation (Imm8[3:2]).  The second index
 * is built by iemAImpl_pcmpxstrx_cmp_override_if_invalid from the validity of
 * the two elements being compared: 2 if the xmm1 element is valid plus 1 if
 * the xmm2/m128 element is valid.  The entry is the value that replaces the
 * comparison result.  Index 3 (both valid) is never looked up.
 */
static const bool g_afCmpOverride[4][4] =
{
    /* xmm1 AND xmm2/m128 invalid, xmm1 invalid BUT xmm2/m128 valid, xmm1 valid BUT xmm2/m128 invalid, unused dummy/padding for parfait */
    { false, false, false, false }, /* Imm8[3:2] = 00b (equal any) */
    { false, false, false, false }, /* Imm8[3:2] = 01b (ranges) */
    { true, false, false, false }, /* Imm8[3:2] = 10b (equal each) */
    { true, true, false, false }, /* Imm8[3:2] = 11b (equal ordered) */
};
17940
17941DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
17942{
17943 if (fSrc1Valid && fSrc2Valid)
17944 return fCmpRes;
17945
17946 uint8_t const bSrc1Valid = fSrc1Valid ? 2 : 0;
17947 uint8_t const bSrc2Valid = fSrc2Valid ? 1 : 0;
17948 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
17949}
17950
/**
 * Aggregates the comparison matrix into the result bit mask and applies the
 * polarity selection.
 *
 * @returns The result mask, one bit per element.
 * @param   afCmpRes    The comparison matrix, indexed [source 2 element][source 1 element].
 * @param   idxLen1     Number of valid elements in the first source; elements
 *                      at or above this index are treated as invalid.
 * @param   idxLen2     Number of valid elements in the second source.
 * @param   cElems      Number of elements per operand.
 * @param   bImm        The control byte: bits 3:2 select the aggregation
 *                      operation, bits 5:4 the polarity.
 */
static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
{
    uint8_t bAggOp = (bImm >> 2) & 0x3;
    uint16_t u16Result = 0;

    switch (bAggOp)
    {
        case 0: /* Equal any */
            /* Bit N is set if source 2 element N matches any source 1 element. */
            for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
            {
                uint16_t u16Res = 0;
                for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1++)
                {
                    if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
                                                                   idxSrc1 < idxLen1,
                                                                   idxSrc2 < idxLen2,
                                                                   bAggOp))
                    {
                        u16Res = RT_BIT(idxSrc2);
                        break;
                    }
                }

                u16Result |= u16Res;
            }
            break;

        case 1: /* Ranges */
            /* Bit N is set if source 2 element N passes both comparisons of any even/odd source 1 pair. */
            for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
            {
                uint16_t u16Res = 0;
                for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1 += 2)
                {
                    if (   iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
                                                                      idxSrc1 < idxLen1,
                                                                      idxSrc2 < idxLen2,
                                                                      bAggOp)
                        && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1 + 1],
                                                                      (idxSrc1 + 1) < idxLen1,
                                                                      idxSrc2 < idxLen2,
                                                                      bAggOp))
                    {
                        u16Res = RT_BIT(idxSrc2);
                        break;
                    }
                }

                u16Result |= u16Res;
            }
            break;

        case 2: /* Equal each */
            /* Bit N is set if element N of both sources compare equal (diagonal of the matrix). */
            for (uint8_t i = 0; i < cElems; i++)
            {
                if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[i][i],
                                                               i < idxLen1,
                                                               i < idxLen2,
                                                               bAggOp))
                    u16Result |= RT_BIT(i);
            }
            break;

        case 3: /* Equal ordered */
            /* Bit N stays set unless some source 1 element idxSrc1 fails against source 2 element N + idxSrc1. */
            u16Result = 0;
            for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
            {
                uint16_t u16Res = RT_BIT(idxSrc2);
                for (uint8_t idxSrc1 = 0, k = idxSrc2; (idxSrc1 < (cElems - idxSrc2)) && (k < cElems); idxSrc1++, k++)
                {
                    if (!iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[k][idxSrc1],
                                                                    idxSrc1 < idxLen1,
                                                                    k < idxLen2,
                                                                    bAggOp))
                    {
                        u16Res = 0;
                        break;
                    }
                }

                u16Result |= u16Res;
            }
            break;
    }

    /* Polarity selection. */
    switch ((bImm >> 4) & 0x3)
    {
        case 0:
        case 2:
            /* Nothing to do. */
            break;
        case 1:
            /* Invert all result bits covered by the element count. */
            u16Result = (cElems == 8 ? 0xff : 0xffff) ^ u16Result;
            break;
        case 3:
            /* Invert only the bits below the length of the second source. */
            u16Result ^= RT_BIT(idxLen2) - 1;
            break;
        default:
            AssertReleaseFailed();
    }

    return u16Result;
}
18054
18055DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
18056{
18057 uint32_t fEFlags = 0;
18058
18059 if (u16Result)
18060 fEFlags |= X86_EFL_CF;
18061 if (cLen2 < cElems)
18062 fEFlags |= X86_EFL_ZF;
18063 if (cLen1 < cElems)
18064 fEFlags |= X86_EFL_SF;
18065 if (u16Result & 0x1)
18066 fEFlags |= X86_EFL_OF;
18067 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
18068}
18069
18070DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
18071 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
18072{
18073 bool afCmpRes[16][16];
18074 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18075
18076 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
18077 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
18078 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
18079
18080 return u16Result;
18081}
18082
18083DECL_FORCE_INLINE(uint32_t) iemAImpl_pcmpxstri_set_result_index(uint16_t u16Result, uint8_t cElems, uint8_t bImm)
18084{
18085 uint32_t u32Ecx;
18086 if (bImm & RT_BIT(6))
18087 {
18088 /* Index for MSB set. */
18089 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
18090 if (idxMsb)
18091 u32Ecx = idxMsb - 1;
18092 else
18093 u32Ecx = cElems;
18094 }
18095 else
18096 {
18097 /* Index for LSB set. */
18098 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
18099 if (idxLsb)
18100 u32Ecx = idxLsb - 1;
18101 else
18102 u32Ecx = cElems;
18103 }
18104
18105 return u32Ecx;
18106}
18107
18108IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pEFlags, PCRTUINT128U pSrc1, PCRTUINT128U pSrc2, uint8_t bEvil))
18109{
18110 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18111 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(pSrc1, bEvil);
18112 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(pSrc2, bEvil);
18113
18114 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, pSrc1, pSrc2, cLen1, cLen2, bEvil);
18115 return iemAImpl_pcmpxstri_set_result_index(u16Result, cElems, bEvil);
18116}
18117
18118
18119/**
18120 * [V]PCMPESTRI
18121 */
18122IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
18123{
18124 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18125 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
18126 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
18127
18128 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18129 *pu32Ecx = iemAImpl_pcmpxstri_set_result_index(u16Result, cElems, bEvil);
18130}
18131
18132
18133/**
18134 * [V]PCMPISTRM
18135 */
18136DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
18137{
18138 if (bImm & RT_BIT(6))
18139 {
18140 /* Generate a mask. */
18141 if (cElems == 8)
18142 {
18143 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18144 if (u16Result & RT_BIT(i))
18145 puDst->au16[i] = 0xffff;
18146 else
18147 puDst->au16[i] = 0;
18148 }
18149 else
18150 {
18151 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
18152 if (u16Result & RT_BIT(i))
18153 puDst->au8[i] = 0xff;
18154 else
18155 puDst->au8[i] = 0;
18156 }
18157 }
18158 else
18159 {
18160 /* Store the result. */
18161 puDst->au64[0] = u16Result;
18162 puDst->au64[1] = 0;
18163 }
18164}
18165
18166IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
18167{
18168 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18169 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
18170 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
18171
18172 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18173 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18174}
18175
18176
18177/**
18178 * [V]PCMPESTRM
18179 */
18180IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
18181{
18182 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18183 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
18184 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
18185
18186 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18187 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18188}
18189
18190
18191/*
18192 * [V]PCLMULQDQ
18193 */
18194IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18195{
18196 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
18197}
18198
18199
18200IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
18201{
18202 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
18203 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
18204
18205 puDst->au64[0] = 0;
18206 puDst->au64[1] = 0;
18207
18208 /*
18209 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
18210 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
18211 * and squeeze out some optimizations.
18212 */
18213 if (uSrc1 & 0x1)
18214 puDst->au64[0] = uSrc2;
18215
18216 uSrc1 >>= 1;
18217
18218 uint8_t iDigit = 1;
18219 while (uSrc1)
18220 {
18221 if (uSrc1 & 0x1)
18222 {
18223 puDst->au64[0] ^= (uSrc2 << iDigit);
18224 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
18225 }
18226
18227 uSrc1 >>= 1;
18228 iDigit++;
18229 }
18230}
18231
18232
18233/**
18234 * [V]MOVMSKPS
18235 */
18236#ifdef IEM_WITHOUT_ASSEMBLY
18237IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18238{
18239 *pu8Dst = puSrc->au32[0] >> 31;
18240 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18241 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18242 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18243}
18244
18245#endif
18246
18247IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18248{
18249 *pu8Dst = puSrc->au32[0] >> 31;
18250 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18251 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18252 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18253}
18254
18255
18256IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18257{
18258 *pu8Dst = puSrc->au32[0] >> 31;
18259 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18260 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18261 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18262 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
18263 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
18264 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
18265 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
18266}
18267
18268
18269/**
18270 * [V]MOVMSKPD
18271 */
18272#ifdef IEM_WITHOUT_ASSEMBLY
18273IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18274{
18275 *pu8Dst = puSrc->au64[0] >> 63;
18276 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18277}
18278
18279#endif
18280
18281IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18282{
18283 *pu8Dst = puSrc->au64[0] >> 63;
18284 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18285}
18286
18287
18288IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18289{
18290 *pu8Dst = puSrc->au64[0] >> 63;
18291 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18292 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
18293 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
18294}
18295
18296
18297/**
18298 * CVTTSD2SI
18299 */
18300#ifdef IEM_WITHOUT_ASSEMBLY
18301IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttsd2si_i32_r64,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint64_t *pu64Src))
18302{
18303 RTFLOAT64U r64Src;
18304
18305 r64Src.u = *pu64Src;
18306 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18307
18308 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18309 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18310 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18311}
18312
18313
18314IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttsd2si_i64_r64,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint64_t *pu64Src))
18315{
18316 RTFLOAT64U r64Src;
18317
18318 r64Src.u = *pu64Src;
18319 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18320
18321 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18322 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18323 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18324}
18325#endif
18326
18327
18328/**
18329 * CVTSD2SI
18330 */
18331#ifdef IEM_WITHOUT_ASSEMBLY
18332IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsd2si_i32_r64,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint64_t *pu64Src))
18333{
18334 RTFLOAT64U r64Src;
18335
18336 r64Src.u = *pu64Src;
18337 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18338
18339 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18340 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18341 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18342}
18343
18344
18345IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsd2si_i64_r64,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint64_t *pu64Src))
18346{
18347 RTFLOAT64U r64Src;
18348
18349 r64Src.u = *pu64Src;
18350 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18351
18352 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18353 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18354 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18355}
18356#endif
18357
18358
18359/**
18360 * CVTTSS2SI
18361 */
18362#ifdef IEM_WITHOUT_ASSEMBLY
18363IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttss2si_i32_r32,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint32_t *pu32Src))
18364{
18365 RTFLOAT32U r32Src;
18366
18367 r32Src.u = *pu32Src;
18368 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18369
18370 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18371 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18372 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18373}
18374
18375
18376IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttss2si_i64_r32,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint32_t *pu32Src))
18377{
18378 RTFLOAT32U r32Src;
18379
18380 r32Src.u = *pu32Src;
18381 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18382
18383 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18384 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18385 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18386}
18387#endif
18388
18389
18390/**
18391 * CVTSS2SI
18392 */
18393#ifdef IEM_WITHOUT_ASSEMBLY
18394IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtss2si_i32_r32,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint32_t *pu32Src))
18395{
18396 RTFLOAT32U r32Src;
18397
18398 r32Src.u = *pu32Src;
18399 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18400
18401 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18402 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18403 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18404}
18405
18406
18407IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtss2si_i64_r32,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint32_t *pu32Src))
18408{
18409 RTFLOAT32U r32Src;
18410
18411 r32Src.u = *pu32Src;
18412 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18413
18414 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18415 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18416 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18417}
18418#endif
18419
18420
18421/**
18422 * CVTSI2SD
18423 */
18424#ifdef IEM_WITHOUT_ASSEMBLY
18425IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2sd_r64_i32,(uint32_t uMxCsrIn, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
18426{
18427 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18428 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
18429 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, uMxCsrIn);
18430}
18431
18432
18433IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2sd_r64_i64,(uint32_t uMxCsrIn, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
18434{
18435 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18436 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
18437 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, uMxCsrIn);
18438}
18439#endif
18440
18441
18442/**
18443 * CVTSI2SS
18444 */
18445#ifdef IEM_WITHOUT_ASSEMBLY
18446IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2ss_r32_i32,(uint32_t uMxCsrIn, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
18447{
18448 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18449 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
18450 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, uMxCsrIn);
18451}
18452
18453
18454IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2ss_r32_i64,(uint32_t uMxCsrIn, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
18455{
18456 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18457 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
18458 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, uMxCsrIn);
18459}
18460#endif
18461
18462
18463/**
18464 * [V]UCOMISS
18465 */
18466#ifdef IEM_WITHOUT_ASSEMBLY
18467IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_ucomiss_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
18468{
18469 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18470
18471 if (RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc2))
18472 {
18473 uMxCsrIn |= X86_MXCSR_IE;
18474 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18475 }
18476 else if (RTFLOAT32U_IS_QUIET_NAN(&uSrc1) || RTFLOAT32U_IS_QUIET_NAN(&uSrc2))
18477 {
18478 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18479 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18480 }
18481 else
18482 {
18483 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18484
18485 RTFLOAT32U r32Src1, r32Src2;
18486 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, uMxCsrIn, &uSrc1);
18487 fDe |= iemSsePrepareValueR32(&r32Src2, uMxCsrIn, &uSrc2);
18488
18489 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18490 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18491 if (f32_eq(f32Src1, f32Src2, &SoftState))
18492 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18493 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18494 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18495 /* else: GREATER_THAN 000 */
18496
18497 uMxCsrIn |= fDe;
18498 }
18499
18500 *pfEFlags = fEFlagsNew;
18501 return uMxCsrIn;
18502}
18503#endif
18504
18505IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vucomiss_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
18506{
18507 return iemAImpl_ucomiss_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
18508}
18509
18510
18511/**
18512 * [V]UCOMISD
18513 */
18514#ifdef IEM_WITHOUT_ASSEMBLY
18515IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_ucomisd_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
18516{
18517 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18518
18519 if (RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc2))
18520 {
18521 uMxCsrIn |= X86_MXCSR_IE;
18522 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18523 }
18524 else if (RTFLOAT64U_IS_QUIET_NAN(&uSrc1) || RTFLOAT64U_IS_QUIET_NAN(&uSrc2))
18525 {
18526 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18527 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18528 }
18529 else
18530 {
18531 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18532
18533 RTFLOAT64U r64Src1, r64Src2;
18534 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, uMxCsrIn, &uSrc1)
18535 | iemSsePrepareValueR64(&r64Src2, uMxCsrIn, &uSrc2);
18536
18537 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18538 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18539 if (f64_eq(f64Src1, f64Src2, &SoftState))
18540 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18541 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18542 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18543 /* else: GREATER_THAN 000 */
18544
18545 uMxCsrIn |= fDe;
18546 }
18547
18548 *pfEFlags = fEFlagsNew;
18549 return uMxCsrIn;
18550}
18551#endif
18552
18553IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vucomisd_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
18554{
18555 return iemAImpl_ucomisd_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
18556}
18557
18558
18559/**
18560 * [V]COMISS
18561 */
18562#ifdef IEM_WITHOUT_ASSEMBLY
18563IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_comiss_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
18564{
18565 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18566
18567 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc2)
18568 || RTFLOAT32U_IS_QUIET_NAN(&uSrc1) || RTFLOAT32U_IS_QUIET_NAN(&uSrc2))
18569 {
18570 uMxCsrIn |= X86_MXCSR_IE;
18571 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18572 }
18573 else
18574 {
18575 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18576
18577 RTFLOAT32U r32Src1, r32Src2;
18578 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, uMxCsrIn, &uSrc1)
18579 | iemSsePrepareValueR32(&r32Src2, uMxCsrIn, &uSrc2);
18580
18581 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18582 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18583 if (f32_eq(f32Src1, f32Src2, &SoftState))
18584 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18585 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18586 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18587 /* else: GREATER_THAN 000 */
18588
18589 uMxCsrIn |= fDe;
18590 }
18591
18592 *pfEFlags = fEFlagsNew;
18593 return uMxCsrIn;
18594}
18595#endif
18596
18597
18598IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vcomiss_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
18599{
18600 return iemAImpl_comiss_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
18601}
18602
18603
18604/**
18605 * [V]COMISD
18606 */
18607#ifdef IEM_WITHOUT_ASSEMBLY
18608IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_comisd_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
18609{
18610 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18611
18612 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc2)
18613 || RTFLOAT64U_IS_QUIET_NAN(&uSrc1) || RTFLOAT64U_IS_QUIET_NAN(&uSrc2))
18614 {
18615 uMxCsrIn |= X86_MXCSR_IE;
18616 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18617 }
18618 else
18619 {
18620 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18621
18622 RTFLOAT64U r64Src1, r64Src2;
18623 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, uMxCsrIn, &uSrc1);
18624 fDe |= iemSsePrepareValueR64(&r64Src2, uMxCsrIn, &uSrc2);
18625
18626 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18627 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18628 if (f64_eq(f64Src1, f64Src2, &SoftState))
18629 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18630 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18631 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18632 /* else: GREATER_THAN 000 */
18633
18634 uMxCsrIn |= fDe;
18635 }
18636
18637 *pfEFlags = fEFlagsNew;
18638 return uMxCsrIn;
18639}
18640#endif
18641
18642IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vcomisd_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
18643{
18644 return iemAImpl_comisd_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
18645}
18646
18647
18648/**
18649 * CMPPS / CMPPD / CMPSS / CMPSD
18650 */
18651#ifdef IEM_WITHOUT_ASSEMBLY
18652/**
18653 * A compare truth table entry.
18654 */
18655typedef struct CMPTRUTHTBLENTRY
18656{
18657 /** Flag whether the \#IA is signalled when one of the source oeprans is a QNaN */
18658 bool fSignalsOnQNan;
18659 /** The boolean result when the input operands are unordered. */
18660 bool fUnordered;
18661 /** The boolean result when A = B. */
18662 bool fEqual;
18663 /** The boolean result when A < B. */
18664 bool fLowerThan;
18665 /** The boolean result when A > B. */
18666 bool fGreaterThan;
18667} CMPTRUTHTBLENTRY;
18668/** Pointer to a const truth table entry. */
18669typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
18670
18671
18672/** The compare truth table (indexed by immediate). */
18673static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
18674{
18675 /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
18676 /* 00H (EQ_OQ) */ { false, false, true, false, false },
18677 /* 01H (LT_OS) */ { true, false, false, true, false },
18678 /* 02H (LE_OS) */ { true, false, true, true, false },
18679 /* 03H (UNORD_Q) */ { false, true, false, false, false },
18680 /* 04H (NEQ_UQ) */ { false, true, false, true, true },
18681 /* 05H (NLT_US) */ { true, true, true, false, true },
18682 /* 06H (NLE_US) */ { true, true, false, false, true },
18683 /* 07H (ORQ_Q) */ { false, false, true, true, true },
18684 /** @todo AVX variants. */
18685};
18686
18687
18688static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
18689{
18690 bool fRes;
18691 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18692
18693 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
18694 {
18695 *pfMxcsr |= X86_MXCSR_IE;
18696 fRes = g_aCmpTbl[bEvil].fUnordered;
18697 }
18698 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
18699 {
18700 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18701 *pfMxcsr |= X86_MXCSR_IE;
18702 fRes = g_aCmpTbl[bEvil].fUnordered;
18703 }
18704 else
18705 {
18706 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18707
18708 RTFLOAT32U r32Src1, r32Src2;
18709 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
18710 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
18711
18712 *pfMxcsr |= fDe;
18713 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18714 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18715 if (f32_eq(f32Src1, f32Src2, &SoftState))
18716 fRes = g_aCmpTbl[bEvil].fEqual;
18717 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18718 fRes = g_aCmpTbl[bEvil].fLowerThan;
18719 else
18720 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18721 }
18722
18723 return fRes;
18724}
18725
18726
18727static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
18728{
18729 bool fRes;
18730 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18731
18732 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
18733 {
18734 *pfMxcsr |= X86_MXCSR_IE;
18735 fRes = g_aCmpTbl[bEvil].fUnordered;
18736 }
18737 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
18738 {
18739 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18740 *pfMxcsr |= X86_MXCSR_IE;
18741 fRes = g_aCmpTbl[bEvil].fUnordered;
18742 }
18743 else
18744 {
18745 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18746
18747 RTFLOAT64U r64Src1, r64Src2;
18748 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
18749 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
18750
18751 *pfMxcsr |= fDe;
18752 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18753 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18754 if (f64_eq(f64Src1, f64Src2, &SoftState))
18755 fRes = g_aCmpTbl[bEvil].fEqual;
18756 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18757 fRes = g_aCmpTbl[bEvil].fLowerThan;
18758 else
18759 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18760 }
18761
18762 return fRes;
18763}
18764
18765
18766IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmpps_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18767{
18768 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18769 {
18770 if (iemAImpl_cmp_worker_r32(&uMxCsrIn, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
18771 puDst->au32[i] = UINT32_MAX;
18772 else
18773 puDst->au32[i] = 0;
18774 }
18775
18776 return uMxCsrIn;
18777}
18778
18779
18780IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmppd_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18781{
18782 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18783 {
18784 if (iemAImpl_cmp_worker_r64(&uMxCsrIn, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
18785 puDst->au64[i] = UINT64_MAX;
18786 else
18787 puDst->au64[i] = 0;
18788 }
18789
18790 return uMxCsrIn;
18791}
18792
18793
18794IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmpss_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18795{
18796 if (iemAImpl_cmp_worker_r32(&uMxCsrIn, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
18797 puDst->au32[0] = UINT32_MAX;
18798 else
18799 puDst->au32[0] = 0;
18800
18801 puDst->au32[1] = pSrc->uSrc1.au32[1];
18802 puDst->au64[1] = pSrc->uSrc1.au64[1];
18803 return uMxCsrIn;
18804}
18805
18806
18807IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmpsd_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18808{
18809 if (iemAImpl_cmp_worker_r64(&uMxCsrIn, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
18810 puDst->au64[0] = UINT64_MAX;
18811 else
18812 puDst->au64[0] = 0;
18813
18814 puDst->au64[1] = pSrc->uSrc1.au64[1];
18815 return uMxCsrIn;
18816}
18817#endif
18818
18819
18820/**
18821 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
18822 */
18823
18824#define X86_SSE_ROUNDXX_IMM_RC_MASK UINT8_C(0x03)
18825#define X86_SSE_ROUNDXX_IMM_ROUND_SEL UINT8_C(0x04)
18826#define X86_SSE_ROUNDXX_IMM_PRECISION UINT8_C(0x08)
18827
18828#define X86_SSE_ROUNDXX_IMM_MASK UINT8_C(0x0F)
18829
18830DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
18831{
18832 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
18833 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18834
18835 fMxcsr &= ~X86_MXCSR_RC_MASK;
18836 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
18837 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18838}
18839
18840static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
18841{
18842 RTFLOAT32U r32Src, r32Dst;
18843 float32_t f32Src;
18844 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18845 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18846
18847 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
18848 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
18849
18850 iemFpSoftF32ToIprt(&r32Dst, f32Src);
18851 return r32Dst;
18852}
18853
18854static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
18855{
18856 RTFLOAT64U r64Src, r64Dst;
18857 float64_t f64Src;
18858 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18859 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18860
18861 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
18862 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
18863
18864 iemFpSoftF64ToIprt(&r64Dst, f64Src);
18865 return r64Dst;
18866}
18867
18868#ifdef IEM_WITHOUT_ASSEMBLY
18869IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundss_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18870{
18871 puDst->ar32[0] = iemAImpl_round_worker_r32(&uMxCsrIn, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18872 puDst->au32[1] = pSrc->uSrc1.au32[1];
18873 puDst->au64[1] = pSrc->uSrc1.au64[1];
18874 return uMxCsrIn;
18875}
18876
18877
18878IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundsd_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18879{
18880 puDst->ar64[0] = iemAImpl_round_worker_r64(&uMxCsrIn, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18881 puDst->au64[1] = pSrc->uSrc1.au64[1];
18882 return uMxCsrIn;
18883}
18884#endif
18885
18886IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundps_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18887{
18888 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18889 {
18890 puDst->ar32[i] = iemAImpl_round_worker_r32(&uMxCsrIn, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18891 }
18892
18893 return uMxCsrIn;
18894}
18895
18896
18897IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundpd_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18898{
18899 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18900 {
18901 puDst->ar64[i] = iemAImpl_round_worker_r64(&uMxCsrIn, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18902 }
18903
18904 return uMxCsrIn;
18905}
18906
18907/**
18908 * CVTPD2PI
18909 */
18910#ifdef IEM_WITHOUT_ASSEMBLY
18911static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18912{
18913 RTFLOAT64U r64Src;
18914 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18915
18916 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18917 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18918 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18919}
18920
18921
18922IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpd2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18923{
18924 RTUINT64U u64Res;
18925 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[0], &pSrc->ar64[0]);
18926 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[1], &pSrc->ar64[1]);
18927
18928 *pu64Dst = u64Res.u;
18929 return fMxcsrOut;
18930}
18931#endif
18932
18933
18934/**
18935 * CVTTPD2PI
18936 */
18937#ifdef IEM_WITHOUT_ASSEMBLY
18938static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18939{
18940 RTFLOAT64U r64Src;
18941 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18942
18943 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18944 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18945 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18946}
18947
18948
18949IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttpd2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18950{
18951 RTUINT64U u64Res;
18952 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[0], &pSrc->ar64[0]);
18953 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[1], &pSrc->ar64[1]);
18954
18955 *pu64Dst = u64Res.u;
18956 return fMxcsrOut;
18957}
18958#endif
18959
18960
18961/**
18962 * CVTPI2PS
18963 */
18964#ifdef IEM_WITHOUT_ASSEMBLY
18965static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
18966{
18967 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18968 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
18969 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
18970}
18971
18972
18973IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpi2ps_u128,(uint32_t fMxCsrIn, PX86XMMREG pDst, uint64_t u64Src))
18974{
18975 RTUINT64U uSrc = { u64Src };
18976 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(fMxCsrIn, &pDst->ar32[0], uSrc.ai32[0]);
18977 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(fMxCsrIn, &pDst->ar32[1], uSrc.ai32[1]);
18978 return fMxcsrOut;
18979}
18980#endif
18981
18982
18983/**
18984 * CVTPI2PD
18985 */
18986#ifdef IEM_WITHOUT_ASSEMBLY
18987static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
18988{
18989 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18990 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
18991 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
18992}
18993
18994
18995IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpi2pd_u128,(uint32_t fMxCsrIn, PX86XMMREG pDst, uint64_t u64Src))
18996{
18997 RTUINT64U uSrc = { u64Src };
18998 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(fMxCsrIn, &pDst->ar64[0], uSrc.ai32[0]);
18999 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(fMxCsrIn, &pDst->ar64[1], uSrc.ai32[1]);
19000 return fMxcsrOut;
19001}
19002#endif
19003
19004
19005/**
19006 * CVTPS2PI
19007 */
19008#ifdef IEM_WITHOUT_ASSEMBLY
19009static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
19010{
19011 RTFLOAT32U r32Src;
19012 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
19013
19014 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19015 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
19016 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
19017}
19018
19019
19020IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtps2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, uint64_t u64Src))
19021{
19022 RTUINT64U uDst;
19023 RTUINT64U uSrc = { u64Src };
19024 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(fMxCsrIn, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
19025 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(fMxCsrIn, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
19026 *pu64Dst = uDst.u;
19027 return fMxcsrOut;
19028}
19029#endif
19030
19031
19032/**
19033 * CVTTPS2PI
19034 */
19035#ifdef IEM_WITHOUT_ASSEMBLY
19036static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
19037{
19038 RTFLOAT32U r32Src;
19039 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
19040
19041 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19042 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
19043 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
19044}
19045
19046
19047IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttps2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, uint64_t u64Src))
19048{
19049 RTUINT64U uDst;
19050 RTUINT64U uSrc = { u64Src };
19051 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(fMxCsrIn, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
19052 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(fMxCsrIn, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
19053 *pu64Dst = uDst.u;
19054 return fMxcsrOut;
19055}
19056#endif
19057
19058/**
19059 * RDRAND
19060 */
19061IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
19062{
19063 *puDst = 0;
19064 *pEFlags &= ~X86_EFL_STATUS_BITS;
19065 *pEFlags |= X86_EFL_CF;
19066}
19067
19068IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
19069{
19070 *puDst = 0;
19071 *pEFlags &= ~X86_EFL_STATUS_BITS;
19072 *pEFlags |= X86_EFL_CF;
19073}
19074
19075IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
19076{
19077 *puDst = 0;
19078 *pEFlags &= ~X86_EFL_STATUS_BITS;
19079 *pEFlags |= X86_EFL_CF;
19080}
19081
19082/**
19083 * RDSEED
19084 */
19085IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
19086{
19087 *puDst = 0;
19088 *pEFlags &= ~X86_EFL_STATUS_BITS;
19089 *pEFlags |= X86_EFL_CF;
19090}
19091
19092IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
19093{
19094 *puDst = 0;
19095 *pEFlags &= ~X86_EFL_STATUS_BITS;
19096 *pEFlags |= X86_EFL_CF;
19097}
19098
19099IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
19100{
19101 *puDst = 0;
19102 *pEFlags &= ~X86_EFL_STATUS_BITS;
19103 *pEFlags |= X86_EFL_CF;
19104}
19105
19106
19107/**
19108 * SHA1NEXTE
19109 */
19110IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19111{
19112 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
19113
19114 puDst->au32[0] = puSrc->au32[0];
19115 puDst->au32[1] = puSrc->au32[1];
19116 puDst->au32[2] = puSrc->au32[2];
19117 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
19118}
19119
19120/**
19121 * SHA1MSG1
19122 */
19123IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19124{
19125 uint32_t u32W0 = puDst->au32[3];
19126 uint32_t u32W1 = puDst->au32[2];
19127 uint32_t u32W2 = puDst->au32[1];
19128 uint32_t u32W3 = puDst->au32[0];
19129 uint32_t u32W4 = puSrc->au32[3];
19130 uint32_t u32W5 = puSrc->au32[2];
19131
19132 puDst->au32[3] = u32W2 ^ u32W0;
19133 puDst->au32[2] = u32W3 ^ u32W1;
19134 puDst->au32[1] = u32W4 ^ u32W2;
19135 puDst->au32[0] = u32W5 ^ u32W3;
19136}
19137
19138/**
19139 * SHA1MSG2
19140 */
19141IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19142{
19143 uint32_t u32W13 = puSrc->au32[2];
19144 uint32_t u32W14 = puSrc->au32[1];
19145 uint32_t u32W15 = puSrc->au32[0];
19146 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
19147 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
19148 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
19149 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
19150
19151 puDst->au32[3] = u32W16;
19152 puDst->au32[2] = u32W17;
19153 puDst->au32[1] = u32W18;
19154 puDst->au32[0] = u32W19;
19155}
19156
/**
 * SHA1RNDS4
 */
/** Signature shared by the SHA1RNDS4 round functions (f0..f3): combines the
 *  B, C and D working values into a single 32-bit word. */
typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
/** Pointer to a SHA1RNDS4 round function. */
typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;
19162
19163static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19164{
19165 return (u32B & u32C) ^ (~u32B & u32D);
19166}
19167
19168static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19169{
19170 return u32B ^ u32C ^ u32D;
19171}
19172
19173static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19174{
19175 return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
19176}
19177
19178static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19179{
19180 return u32B ^ u32C ^ u32D;
19181}
19182
19183IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19184{
19185 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
19186 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
19187
19188 uint32_t au32A[5];
19189 uint32_t au32B[5];
19190 uint32_t au32C[5];
19191 uint32_t au32D[5];
19192 uint32_t au32E[5];
19193 uint32_t au32W[4];
19194 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
19195 uint32_t u32K = s_au32K[bEvil & 0x3];
19196
19197 au32A[0] = puDst->au32[3];
19198 au32B[0] = puDst->au32[2];
19199 au32C[0] = puDst->au32[1];
19200 au32D[0] = puDst->au32[0];
19201 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
19202 au32W[i] = puSrc->au32[3 - i];
19203
19204 /* Round 0 is a bit different than the other rounds. */
19205 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
19206 au32B[1] = au32A[0];
19207 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
19208 au32D[1] = au32C[0];
19209 au32E[1] = au32D[0];
19210
19211 for (uint32_t i = 1; i <= 3; i++)
19212 {
19213 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
19214 au32B[i + 1] = au32A[i];
19215 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
19216 au32D[i + 1] = au32C[i];
19217 au32E[i + 1] = au32D[i];
19218 }
19219
19220 puDst->au32[3] = au32A[4];
19221 puDst->au32[2] = au32B[4];
19222 puDst->au32[1] = au32C[4];
19223 puDst->au32[0] = au32D[4];
19224}
19225
19226
19227/**
19228 * SHA256MSG1
19229 */
19230DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
19231{
19232 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
19233}
19234
19235IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19236{
19237 uint32_t u32W4 = puSrc->au32[0];
19238 uint32_t u32W3 = puDst->au32[3];
19239 uint32_t u32W2 = puDst->au32[2];
19240 uint32_t u32W1 = puDst->au32[1];
19241 uint32_t u32W0 = puDst->au32[0];
19242
19243 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
19244 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
19245 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
19246 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
19247}
19248
19249/**
19250 * SHA256MSG2
19251 */
19252DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
19253{
19254 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
19255}
19256
19257IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19258{
19259 uint32_t u32W14 = puSrc->au32[2];
19260 uint32_t u32W15 = puSrc->au32[3];
19261 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
19262 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
19263 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
19264 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
19265
19266 puDst->au32[3] = u32W19;
19267 puDst->au32[2] = u32W18;
19268 puDst->au32[1] = u32W17;
19269 puDst->au32[0] = u32W16;
19270}
19271
19272/**
19273 * SHA256RNDS2
19274 */
19275DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19276{
19277 return (u32X & u32Y) ^ (~u32X & u32Z);
19278}
19279
19280DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19281{
19282 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
19283}
19284
19285DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
19286{
19287 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
19288}
19289
19290DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
19291{
19292 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
19293}
19294
19295IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
19296{
19297 uint32_t au32A[3];
19298 uint32_t au32B[3];
19299 uint32_t au32C[3];
19300 uint32_t au32D[3];
19301 uint32_t au32E[3];
19302 uint32_t au32F[3];
19303 uint32_t au32G[3];
19304 uint32_t au32H[3];
19305 uint32_t au32WK[2];
19306
19307 au32A[0] = puSrc->au32[3];
19308 au32B[0] = puSrc->au32[2];
19309 au32C[0] = puDst->au32[3];
19310 au32D[0] = puDst->au32[2];
19311 au32E[0] = puSrc->au32[1];
19312 au32F[0] = puSrc->au32[0];
19313 au32G[0] = puDst->au32[1];
19314 au32H[0] = puDst->au32[0];
19315
19316 au32WK[0] = puXmm0Constants->au32[0];
19317 au32WK[1] = puXmm0Constants->au32[1];
19318
19319 for (uint32_t i = 0; i < 2; i++)
19320 {
19321 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19322 + iemAImpl_sha256_upper_sigma1(au32E[i])
19323 + au32WK[i]
19324 + au32H[i]
19325 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
19326 + iemAImpl_sha256_upper_sigma0(au32A[i]);
19327 au32B[i + 1] = au32A[i];
19328 au32C[i + 1] = au32B[i];
19329 au32D[i + 1] = au32C[i];
19330 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19331 + iemAImpl_sha256_upper_sigma1(au32E[i])
19332 + au32WK[i]
19333 + au32H[i]
19334 + au32D[i];
19335 au32F[i + 1] = au32E[i];
19336 au32G[i + 1] = au32F[i];
19337 au32H[i + 1] = au32G[i];
19338 }
19339
19340 puDst->au32[3] = au32A[2];
19341 puDst->au32[2] = au32B[2];
19342 puDst->au32[1] = au32E[2];
19343 puDst->au32[0] = au32F[2];
19344}
19345
19346
19347/**
19348 * ADCX
19349 */
/**
 * Emits the body of an ADCX/ADOX style add-with-flag operation.
 *
 * Expects @c fEFlags, @c puDst and @c uSrc to exist in the expanding scope.
 * Computes *puDst + uSrc + flag, stores the sum back to *puDst and leaves the
 * unsigned carry-out in @a a_Flag of fEFlags; no other flag is touched.
 *
 * @param   a_Flag      The EFLAGS bit used as carry-in and carry-out.
 * @param   a_Type      The unsigned operand type.
 * @param   a_Max       The maximum value of @a a_Type.
 */
#define ADX_EMIT(a_Flag, a_Type, a_Max) \
    do \
    { \
        bool const   fCarryIn = RT_BOOL(fEFlags & (a_Flag)); \
        a_Type const uSum     = *puDst + uSrc; \
        /* Carry-out: the plain sum wrapped, or adding the carry-in will wrap it. */ \
        if (   uSum < uSrc \
            || (fCarryIn && uSum == (a_Max))) \
            fEFlags |= (a_Flag); \
        else \
            fEFlags &= ~(a_Flag); \
        *puDst = (a_Type)(uSum + fCarryIn); \
    } \
    while (0)
19367
/**
 * ADCX, 32-bit, C fallback: *puDst += uSrc + CF, with CF as the only flag that
 * is read or modified.
 *
 * @returns @a fEFlags with X86_EFL_CF set to the carry-out of the addition.
 * @param   fEFlags     The EFLAGS value on input.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adcx_u32_fallback,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
{
    ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
    return fEFlags;
}
19373
/**
 * ADCX, 64-bit, C fallback: *puDst += uSrc + CF, with CF as the only flag that
 * is read or modified.
 *
 * @returns @a fEFlags with X86_EFL_CF set to the carry-out of the addition.
 * @param   fEFlags     The EFLAGS value on input.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adcx_u64_fallback,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
{
    ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
    return fEFlags;
}
19379
19380# if defined(IEM_WITHOUT_ASSEMBLY)
19381
/**
 * ADCX, 32-bit (only built when IEM_WITHOUT_ASSEMBLY is defined):
 * *puDst += uSrc + CF, with CF as the only flag that is read or modified.
 *
 * @returns @a fEFlags with X86_EFL_CF set to the carry-out of the addition.
 * @param   fEFlags     The EFLAGS value on input.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adcx_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
{
    ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
    return fEFlags;
}
19387
/**
 * ADCX, 64-bit (only built when IEM_WITHOUT_ASSEMBLY is defined):
 * *puDst += uSrc + CF, with CF as the only flag that is read or modified.
 *
 * @returns @a fEFlags with X86_EFL_CF set to the carry-out of the addition.
 * @param   fEFlags     The EFLAGS value on input.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adcx_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
{
    ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
    return fEFlags;
}
19393
19394#endif
19395
19396
19397/**
19398 * ADOX
19399 */
/**
 * ADOX, 32-bit, C fallback: *puDst += uSrc + OF, with OF as the only flag that
 * is read or modified (it serves as a second carry flag here).
 *
 * @returns @a fEFlags with X86_EFL_OF set to the unsigned carry-out of the
 *          addition.
 * @param   fEFlags     The EFLAGS value on input.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adox_u32_fallback,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
{
    ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
    return fEFlags;
}
19405
/**
 * ADOX, 64-bit, C fallback: *puDst += uSrc + OF, with OF as the only flag that
 * is read or modified (it serves as a second carry flag here).
 *
 * @returns @a fEFlags with X86_EFL_OF set to the unsigned carry-out of the
 *          addition.
 * @param   fEFlags     The EFLAGS value on input.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adox_u64_fallback,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
{
    ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
    return fEFlags;
}
19411
19412# if defined(IEM_WITHOUT_ASSEMBLY)
19413
/**
 * ADOX, 32-bit (only built when IEM_WITHOUT_ASSEMBLY is defined):
 * *puDst += uSrc + OF, with OF as the only flag that is read or modified.
 *
 * @returns @a fEFlags with X86_EFL_OF set to the unsigned carry-out of the
 *          addition.
 * @param   fEFlags     The EFLAGS value on input.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adox_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
{
    ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
    return fEFlags;
}
19419
/**
 * ADOX, 64-bit (only built when IEM_WITHOUT_ASSEMBLY is defined):
 * *puDst += uSrc + OF, with OF as the only flag that is read or modified.
 *
 * @returns @a fEFlags with X86_EFL_OF set to the unsigned carry-out of the
 *          addition.
 * @param   fEFlags     The EFLAGS value on input.
 * @param   puDst       First operand on input, sum on output.
 * @param   uSrc        Second operand.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adox_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
{
    ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
    return fEFlags;
}
19425
19426# endif
19427
19428
19429/**
19430 * MPSADBW
19431 */
19432IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19433{
19434 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19435 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19436 int16_t ai16Src1[11];
19437 int16_t ai16Src2[4];
19438
19439 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19440 ai16Src1[i] = puDst->au8[idxSrc1 + i];
19441
19442 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19443 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
19444
19445 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19446 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19447 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19448 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19449 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19450}
19451
19452
19453IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
19454{
19455 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19456 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19457 int16_t ai16Src1[11];
19458 int16_t ai16Src2[4];
19459
19460 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19461 ai16Src1[i] = puSrc1->au8[idxSrc1 + i];
19462
19463 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19464 ai16Src2[i] = puSrc2->au8[idxSrc2 + i];
19465
19466 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19467 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19468 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19469 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19470 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19471}
19472
19473
19474IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
19475{
19476 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
19477 RTUINT256U const uSrc2 = *puSrc2;
19478 ASMCompilerBarrier();
19479 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
19480 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil >> 3);
19481}
19482
19483
19484/**
19485 * VPERM2I128
19486 */
19487IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2i128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
19488{
19489 if (bImm & RT_BIT(3))
19490 {
19491 puDst->au64[0] = 0;
19492 puDst->au64[1] = 0;
19493 }
19494 else
19495 {
19496 switch (bImm & 0x3)
19497 {
19498 case 0:
19499 puDst->au64[0] = puSrc1->au64[0];
19500 puDst->au64[1] = puSrc1->au64[1];
19501 break;
19502 case 1:
19503 puDst->au64[0] = puSrc1->au64[2];
19504 puDst->au64[1] = puSrc1->au64[3];
19505 break;
19506 case 2:
19507 puDst->au64[0] = puSrc2->au64[0];
19508 puDst->au64[1] = puSrc2->au64[1];
19509 break;
19510 case 3:
19511 puDst->au64[0] = puSrc2->au64[2];
19512 puDst->au64[1] = puSrc2->au64[3];
19513 break;
19514 }
19515 }
19516
19517 if (bImm & RT_BIT(7))
19518 {
19519 puDst->au64[2] = 0;
19520 puDst->au64[3] = 0;
19521 }
19522 else
19523 {
19524 switch ((bImm >> 4) & 0x3)
19525 {
19526 case 0:
19527 puDst->au64[2] = puSrc1->au64[0];
19528 puDst->au64[3] = puSrc1->au64[1];
19529 break;
19530 case 1:
19531 puDst->au64[2] = puSrc1->au64[2];
19532 puDst->au64[3] = puSrc1->au64[3];
19533 break;
19534 case 2:
19535 puDst->au64[2] = puSrc2->au64[0];
19536 puDst->au64[3] = puSrc2->au64[1];
19537 break;
19538 case 3:
19539 puDst->au64[2] = puSrc2->au64[2];
19540 puDst->au64[3] = puSrc2->au64[3];
19541 break;
19542 }
19543 }
19544}
19545
19546
/**
 * VPERM2F128, C fallback.
 *
 * Forwards all arguments unchanged to iemAImpl_vperm2i128_u256_fallback(), so
 * the result is identical to that function's.
 *
 * @param   puDst       Where to store the result.
 * @param   puSrc1      First source operand.
 * @param   puSrc2      Second source operand.
 * @param   bImm        The lane selection immediate.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2f128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
{
    iemAImpl_vperm2i128_u256_fallback(puDst, puSrc1, puSrc2, bImm);
}
19554
19555
/**
 * DPPS, C fallback - not implemented.
 *
 * Ignores @a puDst, @a pSrc and @a bImm, hits AssertReleaseFailed() and hands
 * back the MXCSR value it was given.  The destination is never written.
 *
 * @returns @a uMxCsrIn, unchanged.
 * @param   uMxCsrIn    The MXCSR value on input.
 * @param   puDst       Unused.
 * @param   pSrc        Unused.
 * @param   bImm        Unused.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_dpps_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
{
    RT_NOREF(puDst, pSrc, bImm);
    AssertReleaseFailed(); /* No C implementation available here. */
    return uMxCsrIn;
}
19565
19566
/**
 * DPPD, C fallback - not implemented.
 *
 * Ignores @a puDst, @a pSrc and @a bImm, hits AssertReleaseFailed() and hands
 * back the MXCSR value it was given.  The destination is never written.
 *
 * @returns @a uMxCsrIn, unchanged.
 * @param   uMxCsrIn    The MXCSR value on input.
 * @param   puDst       Unused.
 * @param   pSrc        Unused.
 * @param   bImm        Unused.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_dppd_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
{
    RT_NOREF(puDst, pSrc, bImm);
    AssertReleaseFailed(); /* No C implementation available here. */
    return uMxCsrIn;
}
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette