VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 103853

Last change on this file since 103853 was 103745, checked in by vboxsync, 9 months ago

ValidationKit/bootsectors: add [v]psll[wdq] / [v]psra[wd] / [v]psrl[wdq] testcases for important corner conditions
VMM/IEM: fix vpsll[wdq] / vpsrl[wdq] emulation to actually pass corner case tests, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 741.1 KB
Line 
1/* $Id: IEMAllAImplC.cpp 103745 2024-03-09 12:28:08Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless it is already defined, it gets defined here for ARM32 and ARM64
 * builds and for doxygen runs.  IEM_WITH_ASSEMBLY overrides it (see below).
 */
#if    !defined(IEM_WITHOUT_ASSEMBLY) \
    && (defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING))
# define IEM_WITHOUT_ASSEMBLY
#endif
/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
#if defined(IEM_WITH_ASSEMBLY)
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Calculates the sign flag value given a result and its bit width.
 *
 * The sign flag (SF) is a copy of the most significant bit of the result, so
 * the result is shifted until that bit lines up with the SF position and
 * everything else is masked away.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( ((uint32_t)((a_uResult) >> ((a_cBitsWidth) - (X86_EFL_SF_BIT + 1)))) & X86_EFL_SF )
73
/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) is set exactly when the result is zero.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult   Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)!(a_uResult) << X86_EFL_ZF_BIT )
84
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * Each variant moves the top bit of an 8/16/32/64-bit value to the OF bit
 * position and masks off the rest.  They are typically selected by
 * concatenating the name with a bit count.  The 8-bit variant is the odd one
 * out: its top bit lies below the OF bit, so it has to shift left while the
 * others shift right.
 */
#define X86_EFL_GET_OF_8(a_uValue)  ( ((uint32_t)(a_uValue) << (X86_EFL_OF_BIT + 1 - 8)) & X86_EFL_OF )
#define X86_EFL_GET_OF_16(a_uValue) ( (uint32_t)((a_uValue) >> (16 - (X86_EFL_OF_BIT + 1))) & X86_EFL_OF )
#define X86_EFL_GET_OF_32(a_uValue) ( (uint32_t)((a_uValue) >> (32 - (X86_EFL_OF_BIT + 1))) & X86_EFL_OF )
#define X86_EFL_GET_OF_64(a_uValue) ( (uint32_t)((a_uValue) >> (64 - (X86_EFL_OF_BIT + 1))) & X86_EFL_OF )
95
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after an arithmetic op.
 *
 * This is a statement macro: it rewrites the EFLAGS value that @a a_pfEFlags
 * points to and does not yield a value.
 *
 * Overflow during ADDition happens when both inputs have the same sign bit
 * value and the result has a different sign bit value.
 *
 * Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it follows
 * that for SUBtraction the sign bit value must differ between the two inputs
 * and the result's sign bit must differ from the first input.  Callers express
 * this by passing the source with its sign bit flipped as @a a_uSrcOf.
 * Note! Must xor with the sign bit to convert, not do (0 - a_uSrc).
 *
 * See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt
 *
 * @note    The value arguments are expanded more than once; pass expressions
 *          without side effects.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF+OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  It is
 *                          token-pasted into type and macro names, so it must
 *                          be one of these literals.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* OF: inputs agree in sign (per a_uSrcOf) and the result's sign differs from a_uDst. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                      & RT_BIT_64((a_cBitsWidth) - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
132
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * All six status bits are cleared first; PF, ZF and SF are then derived from
 * the result and @a a_fExtra is OR'ed in on top.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand
 * is undefined.  We clear AF, as that seems to make the most sense and also
 * seems to be the correct behavior on current CPUs.
 *
 * This is a statement macro; it updates the value @a a_pfEFlags points to in
 * place.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        *(a_pfEFlags) = (*(a_pfEFlags) & ~X86_EFL_STATUS_BITS) \
                      | g_afParity[(a_uResult) & 0xff] \
                      | X86_EFL_CALC_ZF(a_uResult) \
                      | X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
                      | (a_fExtra); \
    } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
161/**
162 * Parity calculation table.
163 *
164 * This is also used by iemAllAImpl.asm.
165 *
166 * The generator code:
167 * @code
168 * #include <stdio.h>
169 *
170 * int main()
171 * {
172 * unsigned b;
173 * for (b = 0; b < 256; b++)
174 * {
175 * int cOnes = ( b & 1)
176 * + ((b >> 1) & 1)
177 * + ((b >> 2) & 1)
178 * + ((b >> 3) & 1)
179 * + ((b >> 4) & 1)
180 * + ((b >> 5) & 1)
181 * + ((b >> 6) & 1)
182 * + ((b >> 7) & 1);
183 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
184 * b,
185 * (b >> 7) & 1,
186 * (b >> 6) & 1,
187 * (b >> 5) & 1,
188 * (b >> 4) & 1,
189 * (b >> 3) & 1,
190 * (b >> 2) & 1,
191 * (b >> 1) & 1,
192 * b & 1,
193 * cOnes & 1 ? "0" : "X86_EFL_PF");
194 * }
195 * return 0;
196 * }
197 * @endcode
198 */
199uint8_t const g_afParity[256] =
200{
201 /* 0000 = 00000000b */ X86_EFL_PF,
202 /* 0x01 = 00000001b */ 0,
203 /* 0x02 = 00000010b */ 0,
204 /* 0x03 = 00000011b */ X86_EFL_PF,
205 /* 0x04 = 00000100b */ 0,
206 /* 0x05 = 00000101b */ X86_EFL_PF,
207 /* 0x06 = 00000110b */ X86_EFL_PF,
208 /* 0x07 = 00000111b */ 0,
209 /* 0x08 = 00001000b */ 0,
210 /* 0x09 = 00001001b */ X86_EFL_PF,
211 /* 0x0a = 00001010b */ X86_EFL_PF,
212 /* 0x0b = 00001011b */ 0,
213 /* 0x0c = 00001100b */ X86_EFL_PF,
214 /* 0x0d = 00001101b */ 0,
215 /* 0x0e = 00001110b */ 0,
216 /* 0x0f = 00001111b */ X86_EFL_PF,
217 /* 0x10 = 00010000b */ 0,
218 /* 0x11 = 00010001b */ X86_EFL_PF,
219 /* 0x12 = 00010010b */ X86_EFL_PF,
220 /* 0x13 = 00010011b */ 0,
221 /* 0x14 = 00010100b */ X86_EFL_PF,
222 /* 0x15 = 00010101b */ 0,
223 /* 0x16 = 00010110b */ 0,
224 /* 0x17 = 00010111b */ X86_EFL_PF,
225 /* 0x18 = 00011000b */ X86_EFL_PF,
226 /* 0x19 = 00011001b */ 0,
227 /* 0x1a = 00011010b */ 0,
228 /* 0x1b = 00011011b */ X86_EFL_PF,
229 /* 0x1c = 00011100b */ 0,
230 /* 0x1d = 00011101b */ X86_EFL_PF,
231 /* 0x1e = 00011110b */ X86_EFL_PF,
232 /* 0x1f = 00011111b */ 0,
233 /* 0x20 = 00100000b */ 0,
234 /* 0x21 = 00100001b */ X86_EFL_PF,
235 /* 0x22 = 00100010b */ X86_EFL_PF,
236 /* 0x23 = 00100011b */ 0,
237 /* 0x24 = 00100100b */ X86_EFL_PF,
238 /* 0x25 = 00100101b */ 0,
239 /* 0x26 = 00100110b */ 0,
240 /* 0x27 = 00100111b */ X86_EFL_PF,
241 /* 0x28 = 00101000b */ X86_EFL_PF,
242 /* 0x29 = 00101001b */ 0,
243 /* 0x2a = 00101010b */ 0,
244 /* 0x2b = 00101011b */ X86_EFL_PF,
245 /* 0x2c = 00101100b */ 0,
246 /* 0x2d = 00101101b */ X86_EFL_PF,
247 /* 0x2e = 00101110b */ X86_EFL_PF,
248 /* 0x2f = 00101111b */ 0,
249 /* 0x30 = 00110000b */ X86_EFL_PF,
250 /* 0x31 = 00110001b */ 0,
251 /* 0x32 = 00110010b */ 0,
252 /* 0x33 = 00110011b */ X86_EFL_PF,
253 /* 0x34 = 00110100b */ 0,
254 /* 0x35 = 00110101b */ X86_EFL_PF,
255 /* 0x36 = 00110110b */ X86_EFL_PF,
256 /* 0x37 = 00110111b */ 0,
257 /* 0x38 = 00111000b */ 0,
258 /* 0x39 = 00111001b */ X86_EFL_PF,
259 /* 0x3a = 00111010b */ X86_EFL_PF,
260 /* 0x3b = 00111011b */ 0,
261 /* 0x3c = 00111100b */ X86_EFL_PF,
262 /* 0x3d = 00111101b */ 0,
263 /* 0x3e = 00111110b */ 0,
264 /* 0x3f = 00111111b */ X86_EFL_PF,
265 /* 0x40 = 01000000b */ 0,
266 /* 0x41 = 01000001b */ X86_EFL_PF,
267 /* 0x42 = 01000010b */ X86_EFL_PF,
268 /* 0x43 = 01000011b */ 0,
269 /* 0x44 = 01000100b */ X86_EFL_PF,
270 /* 0x45 = 01000101b */ 0,
271 /* 0x46 = 01000110b */ 0,
272 /* 0x47 = 01000111b */ X86_EFL_PF,
273 /* 0x48 = 01001000b */ X86_EFL_PF,
274 /* 0x49 = 01001001b */ 0,
275 /* 0x4a = 01001010b */ 0,
276 /* 0x4b = 01001011b */ X86_EFL_PF,
277 /* 0x4c = 01001100b */ 0,
278 /* 0x4d = 01001101b */ X86_EFL_PF,
279 /* 0x4e = 01001110b */ X86_EFL_PF,
280 /* 0x4f = 01001111b */ 0,
281 /* 0x50 = 01010000b */ X86_EFL_PF,
282 /* 0x51 = 01010001b */ 0,
283 /* 0x52 = 01010010b */ 0,
284 /* 0x53 = 01010011b */ X86_EFL_PF,
285 /* 0x54 = 01010100b */ 0,
286 /* 0x55 = 01010101b */ X86_EFL_PF,
287 /* 0x56 = 01010110b */ X86_EFL_PF,
288 /* 0x57 = 01010111b */ 0,
289 /* 0x58 = 01011000b */ 0,
290 /* 0x59 = 01011001b */ X86_EFL_PF,
291 /* 0x5a = 01011010b */ X86_EFL_PF,
292 /* 0x5b = 01011011b */ 0,
293 /* 0x5c = 01011100b */ X86_EFL_PF,
294 /* 0x5d = 01011101b */ 0,
295 /* 0x5e = 01011110b */ 0,
296 /* 0x5f = 01011111b */ X86_EFL_PF,
297 /* 0x60 = 01100000b */ X86_EFL_PF,
298 /* 0x61 = 01100001b */ 0,
299 /* 0x62 = 01100010b */ 0,
300 /* 0x63 = 01100011b */ X86_EFL_PF,
301 /* 0x64 = 01100100b */ 0,
302 /* 0x65 = 01100101b */ X86_EFL_PF,
303 /* 0x66 = 01100110b */ X86_EFL_PF,
304 /* 0x67 = 01100111b */ 0,
305 /* 0x68 = 01101000b */ 0,
306 /* 0x69 = 01101001b */ X86_EFL_PF,
307 /* 0x6a = 01101010b */ X86_EFL_PF,
308 /* 0x6b = 01101011b */ 0,
309 /* 0x6c = 01101100b */ X86_EFL_PF,
310 /* 0x6d = 01101101b */ 0,
311 /* 0x6e = 01101110b */ 0,
312 /* 0x6f = 01101111b */ X86_EFL_PF,
313 /* 0x70 = 01110000b */ 0,
314 /* 0x71 = 01110001b */ X86_EFL_PF,
315 /* 0x72 = 01110010b */ X86_EFL_PF,
316 /* 0x73 = 01110011b */ 0,
317 /* 0x74 = 01110100b */ X86_EFL_PF,
318 /* 0x75 = 01110101b */ 0,
319 /* 0x76 = 01110110b */ 0,
320 /* 0x77 = 01110111b */ X86_EFL_PF,
321 /* 0x78 = 01111000b */ X86_EFL_PF,
322 /* 0x79 = 01111001b */ 0,
323 /* 0x7a = 01111010b */ 0,
324 /* 0x7b = 01111011b */ X86_EFL_PF,
325 /* 0x7c = 01111100b */ 0,
326 /* 0x7d = 01111101b */ X86_EFL_PF,
327 /* 0x7e = 01111110b */ X86_EFL_PF,
328 /* 0x7f = 01111111b */ 0,
329 /* 0x80 = 10000000b */ 0,
330 /* 0x81 = 10000001b */ X86_EFL_PF,
331 /* 0x82 = 10000010b */ X86_EFL_PF,
332 /* 0x83 = 10000011b */ 0,
333 /* 0x84 = 10000100b */ X86_EFL_PF,
334 /* 0x85 = 10000101b */ 0,
335 /* 0x86 = 10000110b */ 0,
336 /* 0x87 = 10000111b */ X86_EFL_PF,
337 /* 0x88 = 10001000b */ X86_EFL_PF,
338 /* 0x89 = 10001001b */ 0,
339 /* 0x8a = 10001010b */ 0,
340 /* 0x8b = 10001011b */ X86_EFL_PF,
341 /* 0x8c = 10001100b */ 0,
342 /* 0x8d = 10001101b */ X86_EFL_PF,
343 /* 0x8e = 10001110b */ X86_EFL_PF,
344 /* 0x8f = 10001111b */ 0,
345 /* 0x90 = 10010000b */ X86_EFL_PF,
346 /* 0x91 = 10010001b */ 0,
347 /* 0x92 = 10010010b */ 0,
348 /* 0x93 = 10010011b */ X86_EFL_PF,
349 /* 0x94 = 10010100b */ 0,
350 /* 0x95 = 10010101b */ X86_EFL_PF,
351 /* 0x96 = 10010110b */ X86_EFL_PF,
352 /* 0x97 = 10010111b */ 0,
353 /* 0x98 = 10011000b */ 0,
354 /* 0x99 = 10011001b */ X86_EFL_PF,
355 /* 0x9a = 10011010b */ X86_EFL_PF,
356 /* 0x9b = 10011011b */ 0,
357 /* 0x9c = 10011100b */ X86_EFL_PF,
358 /* 0x9d = 10011101b */ 0,
359 /* 0x9e = 10011110b */ 0,
360 /* 0x9f = 10011111b */ X86_EFL_PF,
361 /* 0xa0 = 10100000b */ X86_EFL_PF,
362 /* 0xa1 = 10100001b */ 0,
363 /* 0xa2 = 10100010b */ 0,
364 /* 0xa3 = 10100011b */ X86_EFL_PF,
365 /* 0xa4 = 10100100b */ 0,
366 /* 0xa5 = 10100101b */ X86_EFL_PF,
367 /* 0xa6 = 10100110b */ X86_EFL_PF,
368 /* 0xa7 = 10100111b */ 0,
369 /* 0xa8 = 10101000b */ 0,
370 /* 0xa9 = 10101001b */ X86_EFL_PF,
371 /* 0xaa = 10101010b */ X86_EFL_PF,
372 /* 0xab = 10101011b */ 0,
373 /* 0xac = 10101100b */ X86_EFL_PF,
374 /* 0xad = 10101101b */ 0,
375 /* 0xae = 10101110b */ 0,
376 /* 0xaf = 10101111b */ X86_EFL_PF,
377 /* 0xb0 = 10110000b */ 0,
378 /* 0xb1 = 10110001b */ X86_EFL_PF,
379 /* 0xb2 = 10110010b */ X86_EFL_PF,
380 /* 0xb3 = 10110011b */ 0,
381 /* 0xb4 = 10110100b */ X86_EFL_PF,
382 /* 0xb5 = 10110101b */ 0,
383 /* 0xb6 = 10110110b */ 0,
384 /* 0xb7 = 10110111b */ X86_EFL_PF,
385 /* 0xb8 = 10111000b */ X86_EFL_PF,
386 /* 0xb9 = 10111001b */ 0,
387 /* 0xba = 10111010b */ 0,
388 /* 0xbb = 10111011b */ X86_EFL_PF,
389 /* 0xbc = 10111100b */ 0,
390 /* 0xbd = 10111101b */ X86_EFL_PF,
391 /* 0xbe = 10111110b */ X86_EFL_PF,
392 /* 0xbf = 10111111b */ 0,
393 /* 0xc0 = 11000000b */ X86_EFL_PF,
394 /* 0xc1 = 11000001b */ 0,
395 /* 0xc2 = 11000010b */ 0,
396 /* 0xc3 = 11000011b */ X86_EFL_PF,
397 /* 0xc4 = 11000100b */ 0,
398 /* 0xc5 = 11000101b */ X86_EFL_PF,
399 /* 0xc6 = 11000110b */ X86_EFL_PF,
400 /* 0xc7 = 11000111b */ 0,
401 /* 0xc8 = 11001000b */ 0,
402 /* 0xc9 = 11001001b */ X86_EFL_PF,
403 /* 0xca = 11001010b */ X86_EFL_PF,
404 /* 0xcb = 11001011b */ 0,
405 /* 0xcc = 11001100b */ X86_EFL_PF,
406 /* 0xcd = 11001101b */ 0,
407 /* 0xce = 11001110b */ 0,
408 /* 0xcf = 11001111b */ X86_EFL_PF,
409 /* 0xd0 = 11010000b */ 0,
410 /* 0xd1 = 11010001b */ X86_EFL_PF,
411 /* 0xd2 = 11010010b */ X86_EFL_PF,
412 /* 0xd3 = 11010011b */ 0,
413 /* 0xd4 = 11010100b */ X86_EFL_PF,
414 /* 0xd5 = 11010101b */ 0,
415 /* 0xd6 = 11010110b */ 0,
416 /* 0xd7 = 11010111b */ X86_EFL_PF,
417 /* 0xd8 = 11011000b */ X86_EFL_PF,
418 /* 0xd9 = 11011001b */ 0,
419 /* 0xda = 11011010b */ 0,
420 /* 0xdb = 11011011b */ X86_EFL_PF,
421 /* 0xdc = 11011100b */ 0,
422 /* 0xdd = 11011101b */ X86_EFL_PF,
423 /* 0xde = 11011110b */ X86_EFL_PF,
424 /* 0xdf = 11011111b */ 0,
425 /* 0xe0 = 11100000b */ 0,
426 /* 0xe1 = 11100001b */ X86_EFL_PF,
427 /* 0xe2 = 11100010b */ X86_EFL_PF,
428 /* 0xe3 = 11100011b */ 0,
429 /* 0xe4 = 11100100b */ X86_EFL_PF,
430 /* 0xe5 = 11100101b */ 0,
431 /* 0xe6 = 11100110b */ 0,
432 /* 0xe7 = 11100111b */ X86_EFL_PF,
433 /* 0xe8 = 11101000b */ X86_EFL_PF,
434 /* 0xe9 = 11101001b */ 0,
435 /* 0xea = 11101010b */ 0,
436 /* 0xeb = 11101011b */ X86_EFL_PF,
437 /* 0xec = 11101100b */ 0,
438 /* 0xed = 11101101b */ X86_EFL_PF,
439 /* 0xee = 11101110b */ X86_EFL_PF,
440 /* 0xef = 11101111b */ 0,
441 /* 0xf0 = 11110000b */ X86_EFL_PF,
442 /* 0xf1 = 11110001b */ 0,
443 /* 0xf2 = 11110010b */ 0,
444 /* 0xf3 = 11110011b */ X86_EFL_PF,
445 /* 0xf4 = 11110100b */ 0,
446 /* 0xf5 = 11110101b */ X86_EFL_PF,
447 /* 0xf6 = 11110110b */ X86_EFL_PF,
448 /* 0xf7 = 11110111b */ 0,
449 /* 0xf8 = 11111000b */ 0,
450 /* 0xf9 = 11111001b */ X86_EFL_PF,
451 /* 0xfa = 11111010b */ X86_EFL_PF,
452 /* 0xfb = 11111011b */ 0,
453 /* 0xfc = 11111100b */ X86_EFL_PF,
454 /* 0xfd = 11111101b */ 0,
455 /* 0xfe = 11111110b */ 0,
456 /* 0xff = 11111111b */ X86_EFL_PF,
457};
458
459/* for clang: */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT32U g_ar32One[];
464extern const RTFLOAT80U g_ar80One[];
465extern const RTFLOAT80U g_r80Indefinite;
466extern const RTFLOAT32U g_ar32Infinity[];
467extern const RTFLOAT64U g_ar64Infinity[];
468extern const RTFLOAT80U g_ar80Infinity[];
469extern const RTFLOAT128U g_r128Ln2;
470extern const RTUINT128U g_u128Ln2Mantissa;
471extern const RTUINT128U g_u128Ln2MantissaIntel;
472extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
473extern const RTFLOAT32U g_ar32QNaN[];
474extern const RTFLOAT64U g_ar64QNaN[];
475
476/** Zero values (indexed by fSign). */
477RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
478RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
479RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
480
481/** One values (indexed by fSign). */
482RTFLOAT32U const g_ar32One[] =
483{ RTFLOAT32U_INIT(0, 0, RTFLOAT32U_EXP_BIAS), RTFLOAT32U_INIT(1, 0, RTFLOAT32U_EXP_BIAS) };
484RTFLOAT80U const g_ar80One[] =
485{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
486
487/** Indefinite (negative). */
488RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
489
490/** Infinities (indexed by fSign). */
491RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
492RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
493RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
494
495/** Default QNaNs (indexed by fSign). */
496RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
497RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
498
499
#if 0 /* Compiled out: g_r128Two is not defined in any build. */
/** 128-bit floating point constant: 2.0 */
RTFLOAT128U const g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
#endif
504
505
506/* The next section is generated by tools/IEMGenFpuConstants: */
507
508/** The ln2 constant as 128-bit floating point value.
509 * base-10: 6.93147180559945309417232121458176575e-1
510 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
511 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
512//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
513const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
514/** High precision ln2 value.
515 * base-10: 6.931471805599453094172321214581765680747e-1
516 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
517 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
518const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
519/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
520 * base-10: 6.931471805599453094151379470289064954613e-1
521 * base-16: b.17217f7d1cf79abc0000000000000000@-1
522 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
523const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
524
525/** Horner constants for f2xm1 */
526const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
527{
528 /* a0
529 * base-10: 1.00000000000000000000000000000000000e0
530 * base-16: 1.0000000000000000000000000000@0
531 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
532 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
533 /* a1
534 * base-10: 5.00000000000000000000000000000000000e-1
535 * base-16: 8.0000000000000000000000000000@-1
536 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
537 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
538 /* a2
539 * base-10: 1.66666666666666666666666666666666658e-1
540 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
541 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
542 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
543 /* a3
544 * base-10: 4.16666666666666666666666666666666646e-2
545 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
546 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
547 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
548 /* a4
549 * base-10: 8.33333333333333333333333333333333323e-3
550 * base-16: 2.2222222222222222222222222222@-2
551 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
552 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
553 /* a5
554 * base-10: 1.38888888888888888888888888888888874e-3
555 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
556 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
557 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
558 /* a6
559 * base-10: 1.98412698412698412698412698412698412e-4
560 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
561 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
562 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
563 /* a7
564 * base-10: 2.48015873015873015873015873015873015e-5
565 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
566 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
567 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
568 /* a8
569 * base-10: 2.75573192239858906525573192239858902e-6
570 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
571 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
572 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
573 /* a9
574 * base-10: 2.75573192239858906525573192239858865e-7
575 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
576 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
577 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
578 /* a10
579 * base-10: 2.50521083854417187750521083854417184e-8
580 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
581 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
582 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
583 /* a11
584 * base-10: 2.08767569878680989792100903212014296e-9
585 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
586 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
587 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
588 /* a12
589 * base-10: 1.60590438368216145993923771701549472e-10
590 * base-16: b.092309d43684be51c198e91d7b40@-9
591 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
592 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
593 /* a13
594 * base-10: 1.14707455977297247138516979786821043e-11
595 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
596 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
597 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
598 /* a14
599 * base-10: 7.64716373181981647590113198578806964e-13
600 * base-16: d.73f9f399dc0f88ec32b587746578@-11
601 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
602 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
603 /* a15
604 * base-10: 4.77947733238738529743820749111754352e-14
605 * base-16: d.73f9f399dc0f88ec32b587746578@-12
606 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
607 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
608 /* a16
609 * base-10: 2.81145725434552076319894558301031970e-15
610 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
611 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
612 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
613 /* a17
614 * base-10: 1.56192069685862264622163643500573321e-16
615 * base-16: b.413c31dcbecbbdd8024435161550@-14
616 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
617 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
618 /* a18
619 * base-10: 8.22063524662432971695598123687227980e-18
620 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
621 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
622 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
623 /* a19
624 * base-10: 4.11031762331216485847799061843614006e-19
625 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
626 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
627 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
628 /* a20
629 * base-10: 1.95729410633912612308475743735054143e-20
630 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
631 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
632 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
633 /* a21
634 * base-10: 8.89679139245057328674889744250246106e-22
635 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
636 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
637 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
638};
639
640
/*
 * There are a few 64-bit operations that we would rather do in C on 32-bit
 * hosts.  Actually, doing it all in C is probably safer for now; optimize
 * what is necessary later, maybe.
 */
645#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
646
647
648/*********************************************************************************************************************************
649* Binary Operations *
650*********************************************************************************************************************************/
651
652/*
653 * ADD
654 */
655
656IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
657{
658 uint64_t uDst = *puDst;
659 uint64_t uResult = uDst + uSrc;
660 *puDst = uResult;
661 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
662}
663
664# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
665
666IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
667{
668 uint32_t uDst = *puDst;
669 uint32_t uResult = uDst + uSrc;
670 *puDst = uResult;
671 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
672}
673
674
675IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
676{
677 uint16_t uDst = *puDst;
678 uint16_t uResult = uDst + uSrc;
679 *puDst = uResult;
680 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
681}
682
683
684IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
685{
686 uint8_t uDst = *puDst;
687 uint8_t uResult = uDst + uSrc;
688 *puDst = uResult;
689 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
690}
691
692# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
693
694/*
695 * ADC
696 */
697
698IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
699{
700 if (!(*pfEFlags & X86_EFL_CF))
701 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
702 else
703 {
704 uint64_t uDst = *puDst;
705 uint64_t uResult = uDst + uSrc + 1;
706 *puDst = uResult;
707 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
708 }
709}
710
711# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
712
713IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
714{
715 if (!(*pfEFlags & X86_EFL_CF))
716 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
717 else
718 {
719 uint32_t uDst = *puDst;
720 uint32_t uResult = uDst + uSrc + 1;
721 *puDst = uResult;
722 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
723 }
724}
725
726
727IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
728{
729 if (!(*pfEFlags & X86_EFL_CF))
730 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
731 else
732 {
733 uint16_t uDst = *puDst;
734 uint16_t uResult = uDst + uSrc + 1;
735 *puDst = uResult;
736 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
737 }
738}
739
740
741IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
742{
743 if (!(*pfEFlags & X86_EFL_CF))
744 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
745 else
746 {
747 uint8_t uDst = *puDst;
748 uint8_t uResult = uDst + uSrc + 1;
749 *puDst = uResult;
750 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
751 }
752}
753
754# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
755
756/*
757 * SUB
758 */
759# if !defined(RT_ARCH_ARM64)
760
761IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
762{
763 uint64_t uDst = *puDst;
764 uint64_t uResult = uDst - uSrc;
765 *puDst = uResult;
766 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
767}
768
769# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
770
771IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
772{
773 uint32_t uDst = *puDst;
774 uint32_t uResult = uDst - uSrc;
775 *puDst = uResult;
776 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
777}
778
779
780IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
781{
782 uint16_t uDst = *puDst;
783 uint16_t uResult = uDst - uSrc;
784 *puDst = uResult;
785 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
786}
787
788
789IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
790{
791 uint8_t uDst = *puDst;
792 uint8_t uResult = uDst - uSrc;
793 *puDst = uResult;
794 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
795}
796
797# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
798# endif /* !RT_ARCH_ARM64 */
799
800/*
801 * SBB
802 */
803
804IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
805{
806 if (!(*pfEFlags & X86_EFL_CF))
807 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
808 else
809 {
810 uint64_t uDst = *puDst;
811 uint64_t uResult = uDst - uSrc - 1;
812 *puDst = uResult;
813 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
814 }
815}
816
817# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
818
819IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
820{
821 if (!(*pfEFlags & X86_EFL_CF))
822 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
823 else
824 {
825 uint32_t uDst = *puDst;
826 uint32_t uResult = uDst - uSrc - 1;
827 *puDst = uResult;
828 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
829 }
830}
831
832
833IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
834{
835 if (!(*pfEFlags & X86_EFL_CF))
836 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
837 else
838 {
839 uint16_t uDst = *puDst;
840 uint16_t uResult = uDst - uSrc - 1;
841 *puDst = uResult;
842 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
843 }
844}
845
846
847IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
848{
849 if (!(*pfEFlags & X86_EFL_CF))
850 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
851 else
852 {
853 uint8_t uDst = *puDst;
854 uint8_t uResult = uDst - uSrc - 1;
855 *puDst = uResult;
856 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
857 }
858}
859
860# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
861
862
863/*
864 * OR
865 */
866
867IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
868{
869 uint64_t uResult = *puDst | uSrc;
870 *puDst = uResult;
871 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
872}
873
874# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
875
876IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
877{
878 uint32_t uResult = *puDst | uSrc;
879 *puDst = uResult;
880 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
881}
882
883
884IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
885{
886 uint16_t uResult = *puDst | uSrc;
887 *puDst = uResult;
888 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
889}
890
891
892IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
893{
894 uint8_t uResult = *puDst | uSrc;
895 *puDst = uResult;
896 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
897}
898
899# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
900
901/*
902 * XOR
903 */
904
905IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
906{
907 uint64_t uResult = *puDst ^ uSrc;
908 *puDst = uResult;
909 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
910}
911
912# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
913
914IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
915{
916 uint32_t uResult = *puDst ^ uSrc;
917 *puDst = uResult;
918 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
919}
920
921
922IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
923{
924 uint16_t uResult = *puDst ^ uSrc;
925 *puDst = uResult;
926 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
927}
928
929
930IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
931{
932 uint8_t uResult = *puDst ^ uSrc;
933 *puDst = uResult;
934 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
935}
936
937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
938
939/*
940 * AND
941 */
942
943IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
944{
945 uint64_t const uResult = *puDst & uSrc;
946 *puDst = uResult;
947 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
948}
949
950# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
951
952IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
953{
954 uint32_t const uResult = *puDst & uSrc;
955 *puDst = uResult;
956 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
957}
958
959
960IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
961{
962 uint16_t const uResult = *puDst & uSrc;
963 *puDst = uResult;
964 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
965}
966
967
968IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
969{
970 uint8_t const uResult = *puDst & uSrc;
971 *puDst = uResult;
972 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
973}
974
975# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
976#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
977
978/*
979 * ANDN (BMI1 instruction)
980 */
981
982IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
983{
984 uint64_t const uResult = ~uSrc1 & uSrc2;
985 *puDst = uResult;
986 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
987}
988
989
990IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
991{
992 uint32_t const uResult = ~uSrc1 & uSrc2;
993 *puDst = uResult;
994 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
995}
996
997
998#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
999IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
1000{
1001 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1002}
1003#endif
1004
1005
1006#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1007IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1008{
1009 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1010}
1011#endif
1012
1013#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1014
1015/*
1016 * CMP
1017 */
1018
1019IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1020{
1021 uint64_t uDstTmp = *puDst;
1022 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1023}
1024
1025# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1026
1027IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1028{
1029 uint32_t uDstTmp = *puDst;
1030 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1031}
1032
1033
1034IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1035{
1036 uint16_t uDstTmp = *puDst;
1037 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1038}
1039
1040
1041IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1042{
1043 uint8_t uDstTmp = *puDst;
1044 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1045}
1046
1047# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1048
1049/*
1050 * TEST
1051 */
1052
1053IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1054{
1055 uint64_t uResult = *puDst & uSrc;
1056 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
1057}
1058
1059# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1060
1061IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1062{
1063 uint32_t uResult = *puDst & uSrc;
1064 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
1065}
1066
1067
1068IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1069{
1070 uint16_t uResult = *puDst & uSrc;
1071 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
1072}
1073
1074
1075IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1076{
1077 uint8_t uResult = *puDst & uSrc;
1078 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
1079}
1080
1081# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1082
1083
1084/*
1085 * LOCK prefixed variants of the above
1086 */
1087
/**
 * Locked binary operand operation for any operand width.
 *
 * Expects @c puDst, @c uSrc and @c pfEFlags to be in scope at the point of
 * expansion.  The non-locked worker is run on a private copy of the
 * destination and of EFLAGS; the result is then published with a
 * compare-and-exchange, and the whole thing is retried with the freshly
 * observed value whenever the exchange fails.  The caller's EFLAGS are only
 * written once the exchange has succeeded.
 *
 * @param   a_Mnemonic      Instruction mnemonic selecting the worker.
 * @param   a_cBitsWidth    Operand width in bits (selects types and workers).
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uExpected = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uNew; \
        uint32_t                   fEflNew; \
        for (;;) \
        { \
            /* Redo the calculation from the value last seen in memory. */ \
            uNew    = uExpected; \
            fEflNew = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNew, uSrc, &fEflNew); \
            /* On failure uExpected is refreshed with the current memory value. */ \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uExpected, &uExpected)) \
                break; \
        } \
        *pfEFlags = fEflNew; \
    } while (0)
1102
1103
/**
 * Emits the function iemAImpl_<mnemonic>_u<width>_locked, which has the same
 * signature as the non-locked worker and consists of a single
 * DO_LOCKED_BIN_OP expansion.
 *
 * @param   a_Mnemonic      Instruction mnemonic (add, or, btc, ...).
 * @param   a_cBitsWidth    Operand width in bits.
 */
#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked, \
                      (uint ## a_cBitsWidth ## _t *puDst, uint ## a_cBitsWidth ## _t uSrc, uint32_t *pfEFlags)) \
    { \
        DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
    }
1111
1112EMIT_LOCKED_BIN_OP(add, 64)
1113EMIT_LOCKED_BIN_OP(adc, 64)
1114EMIT_LOCKED_BIN_OP(sub, 64)
1115EMIT_LOCKED_BIN_OP(sbb, 64)
1116EMIT_LOCKED_BIN_OP(or, 64)
1117EMIT_LOCKED_BIN_OP(xor, 64)
1118EMIT_LOCKED_BIN_OP(and, 64)
1119# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1120EMIT_LOCKED_BIN_OP(add, 32)
1121EMIT_LOCKED_BIN_OP(adc, 32)
1122EMIT_LOCKED_BIN_OP(sub, 32)
1123EMIT_LOCKED_BIN_OP(sbb, 32)
1124EMIT_LOCKED_BIN_OP(or, 32)
1125EMIT_LOCKED_BIN_OP(xor, 32)
1126EMIT_LOCKED_BIN_OP(and, 32)
1127
1128EMIT_LOCKED_BIN_OP(add, 16)
1129EMIT_LOCKED_BIN_OP(adc, 16)
1130EMIT_LOCKED_BIN_OP(sub, 16)
1131EMIT_LOCKED_BIN_OP(sbb, 16)
1132EMIT_LOCKED_BIN_OP(or, 16)
1133EMIT_LOCKED_BIN_OP(xor, 16)
1134EMIT_LOCKED_BIN_OP(and, 16)
1135
1136EMIT_LOCKED_BIN_OP(add, 8)
1137EMIT_LOCKED_BIN_OP(adc, 8)
1138EMIT_LOCKED_BIN_OP(sub, 8)
1139EMIT_LOCKED_BIN_OP(sbb, 8)
1140EMIT_LOCKED_BIN_OP(or, 8)
1141EMIT_LOCKED_BIN_OP(xor, 8)
1142EMIT_LOCKED_BIN_OP(and, 8)
1143# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1144
1145
1146/*
1147 * Bit operations (same signature as above).
1148 */
1149
1150/*
1151 * BT
1152 */
1153
1154IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1155{
1156 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1157 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1158 Assert(uSrc < 64);
1159 uint64_t uDst = *puDst;
1160 if (uDst & RT_BIT_64(uSrc))
1161 *pfEFlags |= X86_EFL_CF;
1162 else
1163 *pfEFlags &= ~X86_EFL_CF;
1164}
1165
1166# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1167
1168IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1169{
1170 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1171 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1172 Assert(uSrc < 32);
1173 uint32_t uDst = *puDst;
1174 if (uDst & RT_BIT_32(uSrc))
1175 *pfEFlags |= X86_EFL_CF;
1176 else
1177 *pfEFlags &= ~X86_EFL_CF;
1178}
1179
1180IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1181{
1182 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1183 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1184 Assert(uSrc < 16);
1185 uint16_t uDst = *puDst;
1186 if (uDst & RT_BIT_32(uSrc))
1187 *pfEFlags |= X86_EFL_CF;
1188 else
1189 *pfEFlags &= ~X86_EFL_CF;
1190}
1191
1192# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1193
1194/*
1195 * BTC
1196 */
1197
1198IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1199{
1200 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1201 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1202 Assert(uSrc < 64);
1203 uint64_t fMask = RT_BIT_64(uSrc);
1204 uint64_t uDst = *puDst;
1205 if (uDst & fMask)
1206 {
1207 uDst &= ~fMask;
1208 *puDst = uDst;
1209 *pfEFlags |= X86_EFL_CF;
1210 }
1211 else
1212 {
1213 uDst |= fMask;
1214 *puDst = uDst;
1215 *pfEFlags &= ~X86_EFL_CF;
1216 }
1217}
1218
1219# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1220
1221IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1222{
1223 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1224 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1225 Assert(uSrc < 32);
1226 uint32_t fMask = RT_BIT_32(uSrc);
1227 uint32_t uDst = *puDst;
1228 if (uDst & fMask)
1229 {
1230 uDst &= ~fMask;
1231 *puDst = uDst;
1232 *pfEFlags |= X86_EFL_CF;
1233 }
1234 else
1235 {
1236 uDst |= fMask;
1237 *puDst = uDst;
1238 *pfEFlags &= ~X86_EFL_CF;
1239 }
1240}
1241
1242
1243IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1244{
1245 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1246 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1247 Assert(uSrc < 16);
1248 uint16_t fMask = RT_BIT_32(uSrc);
1249 uint16_t uDst = *puDst;
1250 if (uDst & fMask)
1251 {
1252 uDst &= ~fMask;
1253 *puDst = uDst;
1254 *pfEFlags |= X86_EFL_CF;
1255 }
1256 else
1257 {
1258 uDst |= fMask;
1259 *puDst = uDst;
1260 *pfEFlags &= ~X86_EFL_CF;
1261 }
1262}
1263
1264# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1265
1266/*
1267 * BTR
1268 */
1269
1270IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1271{
1272 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1273 logical operation (AND/OR/whatever). */
1274 Assert(uSrc < 64);
1275 uint64_t fMask = RT_BIT_64(uSrc);
1276 uint64_t uDst = *puDst;
1277 if (uDst & fMask)
1278 {
1279 uDst &= ~fMask;
1280 *puDst = uDst;
1281 *pfEFlags |= X86_EFL_CF;
1282 }
1283 else
1284 *pfEFlags &= ~X86_EFL_CF;
1285}
1286
1287# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1288
1289IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1290{
1291 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1292 logical operation (AND/OR/whatever). */
1293 Assert(uSrc < 32);
1294 uint32_t fMask = RT_BIT_32(uSrc);
1295 uint32_t uDst = *puDst;
1296 if (uDst & fMask)
1297 {
1298 uDst &= ~fMask;
1299 *puDst = uDst;
1300 *pfEFlags |= X86_EFL_CF;
1301 }
1302 else
1303 *pfEFlags &= ~X86_EFL_CF;
1304}
1305
1306
1307IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1308{
1309 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1310 logical operation (AND/OR/whatever). */
1311 Assert(uSrc < 16);
1312 uint16_t fMask = RT_BIT_32(uSrc);
1313 uint16_t uDst = *puDst;
1314 if (uDst & fMask)
1315 {
1316 uDst &= ~fMask;
1317 *puDst = uDst;
1318 *pfEFlags |= X86_EFL_CF;
1319 }
1320 else
1321 *pfEFlags &= ~X86_EFL_CF;
1322}
1323
1324# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1325
1326/*
1327 * BTS
1328 */
1329
1330IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1331{
1332 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1333 logical operation (AND/OR/whatever). */
1334 Assert(uSrc < 64);
1335 uint64_t fMask = RT_BIT_64(uSrc);
1336 uint64_t uDst = *puDst;
1337 if (uDst & fMask)
1338 *pfEFlags |= X86_EFL_CF;
1339 else
1340 {
1341 uDst |= fMask;
1342 *puDst = uDst;
1343 *pfEFlags &= ~X86_EFL_CF;
1344 }
1345}
1346
1347# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1348
1349IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1350{
1351 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1352 logical operation (AND/OR/whatever). */
1353 Assert(uSrc < 32);
1354 uint32_t fMask = RT_BIT_32(uSrc);
1355 uint32_t uDst = *puDst;
1356 if (uDst & fMask)
1357 *pfEFlags |= X86_EFL_CF;
1358 else
1359 {
1360 uDst |= fMask;
1361 *puDst = uDst;
1362 *pfEFlags &= ~X86_EFL_CF;
1363 }
1364}
1365
1366
1367IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1368{
1369 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1370 logical operation (AND/OR/whatever). */
1371 Assert(uSrc < 16);
1372 uint16_t fMask = RT_BIT_32(uSrc);
1373 uint32_t uDst = *puDst;
1374 if (uDst & fMask)
1375 *pfEFlags |= X86_EFL_CF;
1376 else
1377 {
1378 uDst |= fMask;
1379 *puDst = uDst;
1380 *pfEFlags &= ~X86_EFL_CF;
1381 }
1382}
1383
1384# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1385
1386
1387EMIT_LOCKED_BIN_OP(btc, 64)
1388EMIT_LOCKED_BIN_OP(btr, 64)
1389EMIT_LOCKED_BIN_OP(bts, 64)
1390# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1391EMIT_LOCKED_BIN_OP(btc, 32)
1392EMIT_LOCKED_BIN_OP(btr, 32)
1393EMIT_LOCKED_BIN_OP(bts, 32)
1394
1395EMIT_LOCKED_BIN_OP(btc, 16)
1396EMIT_LOCKED_BIN_OP(btr, 16)
1397EMIT_LOCKED_BIN_OP(bts, 16)
1398# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1399
1400
1401/*
1402 * Helpers for BSR and BSF.
1403 *
1404 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1405 * Intel behavior modelled on 10980xe, AMD on 3990X. Other microarchitectures
1406 * may produce different results (see https://www.sandpile.org/x86/flags.htm),
1407 * but we restrict ourselves to emulating these recent microarchitectures.
1408 */
/**
 * Stores a BSF/BSR result and sets the flags the Intel way.
 *
 * OF, SF, ZF, AF, PF and CF are all cleared first.  When a bit was found the
 * zero-based index is stored and the g_afParity entry for that index is ORed
 * into the flags; otherwise ZF and PF are set and the destination is left
 * untouched.
 *
 * @param   a_puDst     Where to store the zero-based bit index.
 * @param   a_pfEFlags  Pointer to the EFLAGS variable to update.
 * @param   a_iBit      One-based index of the bit found, 0 if none.
 *                      Evaluated exactly once.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR result and sets the flags the AMD way.
 *
 * Only ZF is modified: cleared when a bit was found (and the zero-based index
 * stored), set otherwise (destination left untouched).
 *
 * @param   a_puDst     Where to store the zero-based bit index.
 * @param   a_pfEFlags  Pointer to the EFLAGS variable to update.
 * @param   a_iBit      One-based index of the bit found, 0 if none.
 *                      Evaluated exactly once.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst) = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1431
1432
1433/*
1434 * BSF - first (least significant) bit set
1435 */
1436IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1437{
1438 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1439}
1440
1441IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1442{
1443 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1444}
1445
1446IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1447{
1448 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1449}
1450
1451# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1452
1453IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1454{
1455 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1456}
1457
1458IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1459{
1460 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1461}
1462
1463IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1464{
1465 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1466}
1467
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1472}
1473
1474IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1475{
1476 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1477}
1478
1479IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1480{
1481 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1482}
1483
1484# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1485
1486
1487/*
1488 * BSR - last (most significant) bit set
1489 */
1490IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1491{
1492 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1493}
1494
1495IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1496{
1497 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1498}
1499
1500IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1501{
1502 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1503}
1504
1505# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1506
1507IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1510}
1511
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516
1517IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1518{
1519 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1520}
1521
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1526}
1527
1528IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1529{
1530 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1531}
1532
1533IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1534{
1535 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1536}
1537
1538# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1539
1540
1541/*
1542 * Helpers for LZCNT and TZCNT.
1543 */
/**
 * Stores an LZCNT/TZCNT result and sets the flags the Intel way.
 *
 * The count is always stored.  OF, SF, ZF, AF, PF and CF are cleared first;
 * then either the g_afParity entry for a non-zero count is ORed in, or ZF and
 * PF are set for a zero count.  CF is set when the source operand is zero.
 *
 * @param   a_puDst     Where to store the count.
 * @param   a_uSrc      The source operand; only tested for being zero.
 * @param   a_pfEFlags  Pointer to the EFLAGS variable to update.
 * @param   a_uResult   The bit count.  Evaluated exactly once.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT result and sets the flags the AMD way.
 *
 * The count is always stored.  Only ZF and CF are modified: ZF is set when
 * the count is zero, CF when the source operand is zero.
 *
 * @param   a_puDst     Where to store the count.
 * @param   a_uSrc      The source operand; only tested for being zero.
 * @param   a_pfEFlags  Pointer to the EFLAGS variable to update.
 * @param   a_uResult   The bit count.  Evaluated exactly once.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1566
1567
1568/*
1569 * LZCNT - count leading zero bits.
1570 */
1571IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1572{
1573 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1574}
1575
1576IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1577{
1578 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1579}
1580
1581IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1582{
1583 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1584}
1585
1586# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1587
1588IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1589{
1590 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1591}
1592
1593IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1594{
1595 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1596}
1597
1598IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1599{
1600 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1601}
1602
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1605{
1606 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1607}
1608
1609IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1610{
1611 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1612}
1613
1614IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1615{
1616 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1617}
1618
1619# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1620
1621
1622/*
1623 * TZCNT - count trailing zero bits.
1624 */
1625IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1626{
1627 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1628}
1629
1630IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1631{
1632 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1633}
1634
1635IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1636{
1637 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1638}
1639
1640# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1643{
1644 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1650}
1651
1652IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1653{
1654 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1655}
1656
1657
1658IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1659{
1660 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1661}
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1666}
1667
1668IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1669{
1670 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1671}
1672
1673# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1674#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1675
1676/*
1677 * BEXTR (BMI1 instruction)
1678 */
1679#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1680IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1681 a_Type uSrc2, uint32_t *pfEFlags)) \
1682{ \
1683 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1684 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1685 a_Type uResult; \
1686 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1687 if (iFirstBit < a_cBits) \
1688 { \
1689 uResult = uSrc1 >> iFirstBit; \
1690 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1691 if (cBits < a_cBits) \
1692 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1693 *puDst = uResult; \
1694 if (!uResult) \
1695 fEfl |= X86_EFL_ZF; \
1696 } \
1697 else \
1698 { \
1699 *puDst = uResult = 0; \
1700 fEfl |= X86_EFL_ZF; \
1701 } \
1702 /** @todo complete flag calculations. */ \
1703 *pfEFlags = fEfl; \
1704}
1705
1706EMIT_BEXTR(64, uint64_t, _fallback)
1707EMIT_BEXTR(32, uint32_t, _fallback)
1708#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1709EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1710#endif
1711#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1712EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1713#endif
1714
1715/*
1716 * BLSR (BMI1 instruction)
1717 */
1718#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1719IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1720{ \
1721 uint32_t fEfl1 = *pfEFlags; \
1722 uint32_t fEfl2 = fEfl1; \
1723 *puDst = uSrc; \
1724 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1725 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1726 \
1727 /* AMD: The carry flag is from the SUB operation. */ \
1728 /* 10890xe: PF always cleared? */ \
1729 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1730 fEfl2 |= fEfl1 & X86_EFL_CF; \
1731 *pfEFlags = fEfl2; \
1732}
1733
1734EMIT_BLSR(64, uint64_t, _fallback)
1735EMIT_BLSR(32, uint32_t, _fallback)
1736#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1737EMIT_BLSR(64, uint64_t, RT_NOTHING)
1738#endif
1739#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1740EMIT_BLSR(32, uint32_t, RT_NOTHING)
1741#endif
1742
1743/*
1744 * BLSMSK (BMI1 instruction)
1745 */
1746#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1747IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1748{ \
1749 uint32_t fEfl1 = *pfEFlags; \
1750 uint32_t fEfl2 = fEfl1; \
1751 *puDst = uSrc; \
1752 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1753 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1754 \
1755 /* AMD: The carry flag is from the SUB operation. */ \
1756 /* 10890xe: PF always cleared? */ \
1757 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1758 fEfl2 |= fEfl1 & X86_EFL_CF; \
1759 *pfEFlags = fEfl2; \
1760}
1761
1762EMIT_BLSMSK(64, uint64_t, _fallback)
1763EMIT_BLSMSK(32, uint32_t, _fallback)
1764#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1765EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1766#endif
1767#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1768EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1769#endif
1770
1771/*
1772 * BLSI (BMI1 instruction)
1773 */
1774#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1775IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1776{ \
1777 uint32_t fEfl1 = *pfEFlags; \
1778 uint32_t fEfl2 = fEfl1; \
1779 *puDst = uSrc; \
1780 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1781 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1782 \
1783 /* AMD: The carry flag is from the SUB operation. */ \
1784 /* 10890xe: PF always cleared? */ \
1785 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1786 fEfl2 |= fEfl1 & X86_EFL_CF; \
1787 *pfEFlags = fEfl2; \
1788}
1789
1790EMIT_BLSI(64, uint64_t, _fallback)
1791EMIT_BLSI(32, uint32_t, _fallback)
1792#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1793EMIT_BLSI(64, uint64_t, RT_NOTHING)
1794#endif
1795#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1796EMIT_BLSI(32, uint32_t, RT_NOTHING)
1797#endif
1798
1799/*
1800 * BZHI (BMI2 instruction)
1801 */
/**
 * Emits a BZHI worker (zero high bits starting at a given bit index).
 *
 * The index is the low byte of uSrc2.  When it is below the operand width all
 * bits of uSrc1 from that index upwards are cleared; otherwise uSrc1 is passed
 * through unchanged and CF is set.  ZF and SF reflect the result, the other
 * status flags (OF, AF, PF and - unless set as described - CF) are cleared.
 *
 * @param   a_cBits     Operand width in bits.
 * @param   a_Type      Unsigned integer type matching a_cBits.
 * @param   a_Suffix    Suffix appended to the generated function name.
 */
#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
                                                                    a_Type uSrc2, uint32_t *pfEFlags)) \
{ \
    /* Only the low byte of the second source selects the start bit. */ \
    uint8_t const idxStart = (uint8_t)uSrc2; \
    \
    /* The shift is only evaluated when the index is within the operand. */ \
    a_Type const  uResult  = idxStart < a_cBits \
                           ? uSrc1 & (((a_Type)1 << idxStart) - 1) \
                           : uSrc1; \
    \
    uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
    if (idxStart >= a_cBits) \
        fEfl |= X86_EFL_CF; /* out-of-range index is signalled via CF */ \
    fEfl |= X86_EFL_CALC_ZF(uResult); \
    fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
    \
    *puDst    = uResult; \
    *pfEFlags = fEfl; \
}
1821
/* Instantiate the BZHI workers.  The _fallback flavours are always emitted;
   the unsuffixed 64-bit one only when RT_ARCH_X86 or IEM_WITHOUT_ASSEMBLY is
   defined, and the unsuffixed 32-bit one only when the host is neither
   RT_ARCH_X86 nor RT_ARCH_AMD64 or when IEM_WITHOUT_ASSEMBLY is defined.
   NOTE(review): presumably the other configurations get these symbols from an
   assembly implementation - confirm. */
EMIT_BZHI(64, uint64_t, _fallback)
EMIT_BZHI(32, uint32_t, _fallback)
#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_BZHI(64, uint64_t, RT_NOTHING)
#endif
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_BZHI(32, uint32_t, RT_NOTHING)
#endif
1830
1831/*
1832 * POPCNT
1833 */
1834RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1835{
1836 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1837 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1838 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1839 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1840};
1841
1842/** @todo Use native popcount where possible and employ some more efficient
1843 * algorithm here (or in asm.h fallback)! */
1844
1845DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1846{
1847 return g_abBitCounts6[ u16 & 0x3f]
1848 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1849 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1850}
1851
1852DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1853{
1854 return g_abBitCounts6[ u32 & 0x3f]
1855 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1856 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1857 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1858 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1859 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1860}
1861
1862DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1863{
1864 return g_abBitCounts6[ u64 & 0x3f]
1865 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1866 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1867 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1868 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1869 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1870 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1871 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1872 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1873 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1874 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1875}
1876
/**
 * Emits a POPCNT worker: *puDst = number of set bits in uSrc.
 *
 * OF, SF, ZF, AF, PF and CF are all cleared, after which ZF is set when the
 * source is zero.
 *
 * @param   a_cBits     Operand width in bits (selects iemPopCountU<a_cBits>).
 * @param   a_Type      Unsigned integer type matching a_cBits.
 * @param   a_Suffix    Suffix appended to the generated function name.
 */
#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
{ \
    uint32_t fEfl    = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
    a_Type   uResult = 0; \
    if (!uSrc) \
        fEfl |= X86_EFL_ZF; /* zero source: result stays zero */ \
    else \
        uResult = iemPopCountU ## a_cBits(uSrc); \
    *puDst    = uResult; \
    *pfEFlags = fEfl; \
}
1892
/* Instantiate the POPCNT workers.  The _fallback flavours are always emitted;
   the unsuffixed 64-bit one only when RT_ARCH_X86 or IEM_WITHOUT_ASSEMBLY is
   defined, and the unsuffixed 32-bit and 16-bit ones only when the host is
   neither RT_ARCH_X86 nor RT_ARCH_AMD64 or when IEM_WITHOUT_ASSEMBLY is
   defined.
   NOTE(review): presumably the other configurations get these symbols from an
   assembly implementation - confirm. */
EMIT_POPCNT(64, uint64_t, _fallback)
EMIT_POPCNT(32, uint32_t, _fallback)
EMIT_POPCNT(16, uint16_t, _fallback)
#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_POPCNT(64, uint64_t, RT_NOTHING)
#endif
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_POPCNT(32, uint32_t, RT_NOTHING)
EMIT_POPCNT(16, uint16_t, RT_NOTHING)
#endif
1903
1904
1905#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1906
1907/*
1908 * XCHG
1909 */
1910
1911IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1912{
1913#if ARCH_BITS >= 64
1914 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1915#else
1916 uint64_t uOldMem = *puMem;
1917 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1918 ASMNopPause();
1919 *puReg = uOldMem;
1920#endif
1921}
1922
1923# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1924
1925IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1926{
1927 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1928}
1929
1930
1931IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1932{
1933 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1934}
1935
1936
1937IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1938{
1939 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1940}
1941
1942# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1943
1944
1945/* Unlocked variants for fDisregardLock mode: */
1946
1947IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1948{
1949 uint64_t const uOld = *puMem;
1950 *puMem = *puReg;
1951 *puReg = uOld;
1952}
1953
1954# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1955
1956IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1957{
1958 uint32_t const uOld = *puMem;
1959 *puMem = *puReg;
1960 *puReg = uOld;
1961}
1962
1963
1964IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1965{
1966 uint16_t const uOld = *puMem;
1967 *puMem = *puReg;
1968 *puReg = uOld;
1969}
1970
1971
1972IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1973{
1974 uint8_t const uOld = *puMem;
1975 *puMem = *puReg;
1976 *puReg = uOld;
1977}
1978
1979# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1980
1981
1982/*
1983 * XADD and LOCK XADD.
1984 */
1985#define EMIT_XADD(a_cBitsWidth, a_Type) \
1986IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1987{ \
1988 a_Type uDst = *puDst; \
1989 a_Type uResult = uDst; \
1990 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1991 *puDst = uResult; \
1992 *puReg = uDst; \
1993} \
1994\
1995IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1996{ \
1997 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1998 a_Type uResult; \
1999 uint32_t fEflTmp; \
2000 do \
2001 { \
2002 uResult = uOld; \
2003 fEflTmp = *pfEFlags; \
2004 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2005 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2006 *puReg = uOld; \
2007 *pfEFlags = fEflTmp; \
2008}
2009EMIT_XADD(64, uint64_t)
2010# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2011EMIT_XADD(32, uint32_t)
2012EMIT_XADD(16, uint16_t)
2013EMIT_XADD(8, uint8_t)
2014# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2015
2016#endif
2017
2018/*
2019 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2020 *
2021 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2022 * instructions are emulated as locked.
2023 */
2024#if defined(IEM_WITHOUT_ASSEMBLY)
2025
2026IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2027{
2028 uint8_t uOld = *puAl;
2029 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2030 Assert(*puAl == uOld);
2031 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2032}
2033
2034
2035IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2036{
2037 uint16_t uOld = *puAx;
2038 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2039 Assert(*puAx == uOld);
2040 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2041}
2042
2043
2044IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2045{
2046 uint32_t uOld = *puEax;
2047 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2048 Assert(*puEax == uOld);
2049 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2050}
2051
2052
2053# if ARCH_BITS == 32
2054IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2055# else
2056IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2057# endif
2058{
2059# if ARCH_BITS == 32
2060 uint64_t const uSrcReg = *puSrcReg;
2061# endif
2062 uint64_t uOld = *puRax;
2063 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2064 Assert(*puRax == uOld);
2065 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2066}
2067
2068
2069IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2070 uint32_t *pEFlags))
2071{
2072 uint64_t const uNew = pu64EbxEcx->u;
2073 uint64_t const uOld = pu64EaxEdx->u;
2074 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2075 {
2076 Assert(pu64EaxEdx->u == uOld);
2077 *pEFlags |= X86_EFL_ZF;
2078 }
2079 else
2080 *pEFlags &= ~X86_EFL_ZF;
2081}
2082
2083
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * Locked CMPXCHG16B (only built for AMD64 and ARM64 hosts).
 *
 * ZF is set when the exchange took place and cleared otherwise; no other flag
 * is touched here.
 *
 * @param   pu128Dst        The 128-bit memory operand.
 * @param   pu128RaxRdx     The comparand; also passed to the compare-exchange
 *                          as the place to return the old destination value.
 * @param   pu128RbxRcx     The replacement value.
 * @param   pEFlags         EFLAGS; only ZF is modified.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    /* Snapshot of the comparand, only needed by the assertion below. */
    RTUINT128U const uExpected = *pu128RaxRdx;
#  endif
#  if defined(RT_ARCH_AMD64)
    bool const fExchanged = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo,
                                                   pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo, &pu128RaxRdx->u);
#  else
    bool const fExchanged = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  endif
    if (!fExchanged)
        *pEFlags &= ~X86_EFL_ZF;
    else
    {
        Assert(pu128RaxRdx->s.Lo == uExpected.s.Lo && pu128RaxRdx->s.Hi == uExpected.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
}
# endif
2105
2106#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2107
2108# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2109IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2110 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2111{
2112 RTUINT128U u128Tmp = *pu128Dst;
2113 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2114 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2115 {
2116 *pu128Dst = *pu128RbxRcx;
2117 *pEFlags |= X86_EFL_ZF;
2118 }
2119 else
2120 {
2121 *pu128RaxRdx = u128Tmp;
2122 *pEFlags &= ~X86_EFL_ZF;
2123 }
2124}
2125#endif /* !RT_ARCH_ARM64 */
2126
2127#if defined(IEM_WITHOUT_ASSEMBLY)
2128
2129/* Unlocked versions mapped to the locked ones: */
2130
/**
 * 8-bit CMPXCHG without LOCK prefix; simply forwards to the locked worker.
 *
 * @param   pu8Dst      The memory/destination operand.
 * @param   puAl        The accumulator operand.
 * @param   uSrcReg     The value to store on a match.
 * @param   pEFlags     EFLAGS, updated by the locked worker.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8,        (uint8_t  *pu8Dst,  uint8_t  *puAl,  uint8_t  uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
}
2135
2136
2137IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2138{
2139# if 0
2140 /* If correctly aligned, used the locked variation. */
2141 if (!((uintptr_t)pu16Dst & 1))
2142 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2143 else
2144# endif
2145 {
2146 /* Otherwise emulate it as best as we can. */
2147 uint16_t const uOld = *puAx;
2148 uint16_t const uDst = *pu16Dst;
2149 if (uOld == uDst)
2150 {
2151 *pu16Dst = uSrcReg;
2152 iemAImpl_cmp_u16(&uOld, uOld, pEFlags);
2153 }
2154 else
2155 {
2156 *puAx = uDst;
2157 iemAImpl_cmp_u16(&uOld, uDst, pEFlags);
2158 }
2159 }
2160}
2161
2162
2163IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2164{
2165# if 0
2166 /* If correctly aligned, used the locked variation. */
2167 if (!((uintptr_t)pu32Dst & 3))
2168 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2169 else
2170# endif
2171 {
2172 /* Otherwise emulate it as best as we can. */
2173 uint32_t const uOld = *puEax;
2174 uint32_t const uDst = *pu32Dst;
2175 if (uOld == uDst)
2176 {
2177 *pu32Dst = uSrcReg;
2178 iemAImpl_cmp_u32(&uOld, uOld, pEFlags);
2179 }
2180 else
2181 {
2182 *puEax = uDst;
2183 iemAImpl_cmp_u32(&uOld, uDst, pEFlags);
2184 }
2185 }
2186}
2187
2188
2189# if ARCH_BITS == 32
2190IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2191{
2192# if 0
2193 /* If correctly aligned, used the locked variation. */
2194 if (!((uintptr_t)pu32Dst & 7))
2195 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2196 else
2197# endif
2198 {
2199 /* Otherwise emulate it as best as we can. */
2200 uint64_t const uOld = *puRax;
2201 uint64_t const uSrc = *puSrcReg;
2202 uint64_t const uDst = *pu64Dst;
2203 if (uOld == uDst)
2204 {
2205 *pu64Dst = uSrc;
2206 iemAImpl_cmp_u64(&uOld, uOld, pEFlags);
2207 }
2208 else
2209 {
2210 *puRax = uDst;
2211 iemAImpl_cmp_u64(&uOld, uDst, pEFlags);
2212 }
2213 }
2214}
2215# else /* ARCH_BITS != 32 */
2216IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2217{
2218# if 0
2219 /* If correctly aligned, used the locked variation. */
2220 if (!((uintptr_t)pu64Dst & 7))
2221 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2222 else
2223# endif
2224 {
2225 /* Otherwise emulate it as best as we can. */
2226 uint64_t const uOld = *puRax;
2227 uint64_t const uDst = *pu64Dst;
2228 if (uOld == uDst)
2229 {
2230 *pu64Dst = uSrcReg;
2231 iemAImpl_cmp_u64(&uOld, uOld, pEFlags);
2232 }
2233 else
2234 {
2235 *puRax = uDst;
2236 iemAImpl_cmp_u64(&uOld, uDst, pEFlags);
2237 }
2238 }
2239}
2240# endif /* ARCH_BITS != 32 */
2241
2242
2243IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2244{
2245# if 0
2246 /* If correctly aligned, used the locked variation. */
2247 if (!((uintptr_t)pu64Dst & 7))
2248 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2249 else
2250# endif
2251 {
2252 /* Otherwise emulate it as best as we can. */
2253 uint64_t const uNew = pu64EbxEcx->u;
2254 uint64_t const uOld = pu64EaxEdx->u;
2255 uint64_t const uDst = *pu64Dst;
2256 if (uDst == uOld)
2257 {
2258 *pu64Dst = uNew;
2259 *pEFlags |= X86_EFL_ZF;
2260 }
2261 else
2262 {
2263 pu64EaxEdx->u = uDst;
2264 *pEFlags &= ~X86_EFL_ZF;
2265 }
2266 }
2267}
2268
2269
2270IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2271 uint32_t *pEFlags))
2272{
2273# if 0
2274 /* If correctly aligned, used the locked variation. */
2275 if (!((uintptr_t)pu64Dst & 15))
2276 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2277 else
2278# endif
2279 {
2280 /* Otherwise emulate it as best as we can. */
2281# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2282 uint128_t const uNew = pu128RbxRcx->u;
2283 uint128_t const uOld = pu128RaxRdx->u;
2284 uint128_t const uDst = pu128Dst->u;
2285 if (uDst == uOld)
2286 {
2287 pu128Dst->u = uNew;
2288 *pEFlags |= X86_EFL_ZF;
2289 }
2290 else
2291 {
2292 pu128RaxRdx->u = uDst;
2293 *pEFlags &= ~X86_EFL_ZF;
2294 }
2295# else
2296 RTUINT128U const uNew = *pu128RbxRcx;
2297 RTUINT128U const uOld = *pu128RaxRdx;
2298 RTUINT128U const uDst = *pu128Dst;
2299 if ( uDst.s.Lo == uOld.s.Lo
2300 && uDst.s.Hi == uOld.s.Hi)
2301 {
2302 *pu128Dst = uNew;
2303 *pEFlags |= X86_EFL_ZF;
2304 }
2305 else
2306 {
2307 *pu128RaxRdx = uDst;
2308 *pEFlags &= ~X86_EFL_ZF;
2309 }
2310# endif
2311 }
2312}
2313
2314#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2315
2316#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2317 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2318
2319/*
2320 * MUL, IMUL, DIV and IDIV helpers.
2321 *
2322 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2323 * division step so we can select between using C operators and
2324 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2325 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
 *   IDIV/DIV taking all of their input from AX too.  This means we have to
 *   abstract some of the input loads and the result storing.
2329 */
2330
2331DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2332{
2333# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2334 pQuotient->s.Lo = 0;
2335 pQuotient->s.Hi = 0;
2336# endif
2337 RTUINT128U Divisor;
2338 Divisor.s.Lo = u64Divisor;
2339 Divisor.s.Hi = 0;
2340 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2341}
2342
/*
 * Building blocks for the EMIT_MUL / EMIT_IMUL / EMIT_DIV / EMIT_IDIV macros.
 *
 * They deliberately refer to the worker parameters puA, puD and puAX by name
 * and must therefore only be used inside those workers.  Every helper expands
 * to a single expression without a trailing semicolon, so each one can be
 * used as an ordinary statement, including as the body of an unbraced if/else.
 */

/** Loads the double-width dividend: low half from *puA, high half from *puD. */
# define DIV_LOAD(a_Dividend) \
    ((a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD)
/** Loads the 16-bit dividend of the 8-bit division from *puAX. */
# define DIV_LOAD_U8(a_Dividend) \
    ((a_Dividend).u = *puAX)

/** Stores the quotient in *puA and the remainder in *puD. */
# define DIV_STORE(a_Quotient, a_uRemainder)    (*puA = (a_Quotient), *puD = (a_uRemainder))
/** Stores the quotient in the low byte and the remainder in the high byte of *puAX. */
# define DIV_STORE_U8(a_Quotient, a_uRemainder) (*puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8))

/** Loads the implicit first factor from *puA. */
# define MUL_LOAD_F1()                          (*puA)
/** Loads the implicit first factor of the 8-bit multiplication: the low byte of *puAX. */
# define MUL_LOAD_F1_U8()                       ((uint8_t)*puAX)

/** Stores the low half of the product in *puA and the high half in *puD. */
# define MUL_STORE(a_Result)                    (*puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi)
/** Stores the whole 16-bit product in *puAX. */
# define MUL_STORE_U8(a_Result)                 (*puAX = (a_Result).u)

/** Negates a double-width value in place (two's complement). */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    ((a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u)
/** 128-bit flavour of MULDIV_NEG using RTUInt128AssignNeg. */
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/** Multiplies two single-width factors into a double-width result. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    ((a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2))
/** 128-bit flavour of MULDIV_MUL using RTUInt128MulU64ByU64. */
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), (a_Factor1), (a_Factor2))

/** Divides a double-width dividend, producing double-width quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    ((a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
     (a_Remainder).u = (a_Dividend).u % (a_uDivisor))
/** 128-bit flavour of MULDIV_MODDIV using RTUInt128DivRemByU64. */
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), (a_uDivisor))
2372
2373
2374/*
2375 * MUL
2376 */
/**
 * Emits one unsigned MUL worker.
 *
 * The worker multiplies the implicit first factor with uFactor, stores the
 * double-width product and updates EFLAGS.  It always returns 0.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_cBitsWidth2x  Width of the product in bits.
 * @param   a_Args          Parenthesized parameter list of the worker.
 * @param   a_CallArgs      Not referenced by this macro.
 * @param   a_fnLoadF1      Helper loading the implicit first factor.
 * @param   a_fnStore       Helper storing the product.
 * @param   a_fnMul         Helper doing the widening multiplication.
 * @param   a_Suffix        Suffix appended to the generated function name.
 * @param   a_fIntelFlags   Non-zero for the Intel EFLAGS behaviour, zero for AMD.
 */
# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
{ \
    RTUINT ## a_cBitsWidth2x ## U Product; \
    a_fnMul(Product, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
    a_fnStore(Product); \
    \
    uint32_t fEflNew = *pfEFlags; \
    if (a_fIntelFlags) \
    { \
        /* Intel (6700K and 10980XE behavior): all six status flags are */ \
        /* recalculated, SF and PF from the low half of the product.    */ \
        fEflNew &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
        fEflNew |= g_afParity[Product.s.Lo & 0xff]; \
        if (Product.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
            fEflNew |= X86_EFL_SF; \
    } \
    else \
    { \
        /* AMD (3990X): only CF and OF are touched. */ \
        fEflNew &= ~(X86_EFL_CF | X86_EFL_OF); \
    } \
    \
    /* Both: CF and OF are set when the high half of the product is non-zero. */ \
    if (Product.s.Hi != 0) \
        fEflNew |= X86_EFL_CF | X86_EFL_OF; \
    \
    *pfEFlags = fEflNew; \
    return 0; \
}
2405
/**
 * Emits the three flavours of a MUL worker for one operand width: the
 * unsuffixed and the _intel one (both with a_fIntelFlags = 1) and the _amd one
 * (a_fIntelFlags = 0).  The parameters are passed straight on to
 * EMIT_MUL_INNER.
 */
# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
    EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
    EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel,     1) \
    EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd,       0) \

/* The 64-bit workers are always emitted here, the narrower ones only when
   RT_ARCH_X86 is not defined or IEM_WITHOUT_ASSEMBLY is defined.  The 8-bit
   worker takes AX via puAX instead of the puA/puD pair. */
# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
         MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
#  if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
         MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
         MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
         MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
#  endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
# endif /* !DOXYGEN_RUNNING */
2423
2424/*
2425 * MULX
2426 */
2427# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2428IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2429 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2430{ \
2431 RTUINT ## a_cBitsWidth2x ## U Result; \
2432 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2433 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2434 *puDst1 = Result.s.Hi; \
2435} \
2436
2437# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2438EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2439EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2440# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2441EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2442EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2443# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2444# endif /* !DOXYGEN_RUNNING */
2445
2446
2447/*
2448 * IMUL
2449 *
2450 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2451 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2452 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2453 */
/**
 * Emits one signed IMUL worker.
 *
 * The signed multiplication is done on the magnitudes of the two factors using
 * the unsigned multiply helper; the product is negated afterwards when the
 * factors have different signs.  CF and OF are set when the product does not
 * fit the signed single-width range.  The worker always returns 0.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_cBitsWidth2x  Width of the product in bits.
 * @param   a_Args          Parenthesized parameter list of the worker.
 * @param   a_CallArgs      Not referenced by this macro.
 * @param   a_fnLoadF1      Helper loading the implicit first factor.
 * @param   a_fnStore       Helper storing the product.
 * @param   a_fnNeg         Helper negating a double-width value in place.
 * @param   a_fnMul         Helper doing the widening multiplication.
 * @param   a_Suffix        Suffix appended to the generated function name.
 * @param   a_fIntelFlags   Non-zero for the Intel EFLAGS behaviour, zero for AMD.
 */
# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
    \
    /* Split both factors into sign and magnitude. */ \
    uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
    bool const fNegative1 = (uFactor1 & RT_BIT_64(a_cBitsWidth - 1)) != 0; \
    bool const fNegative2 = (uFactor2 & RT_BIT_64(a_cBitsWidth - 1)) != 0; \
    uint ## a_cBitsWidth ## _t const uMagnitude1 = fNegative1 \
                                                 ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - uFactor1) \
                                                 : uFactor1; \
    uint ## a_cBitsWidth ## _t const uMagnitude2 = fNegative2 \
                                                 ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - uFactor2) \
                                                 : uFactor2; \
    \
    RTUINT ## a_cBitsWidth2x ## U Result; \
    a_fnMul(Result, uMagnitude1, uMagnitude2, a_cBitsWidth2x); \
    if (fNegative1 == fNegative2) \
    { \
        /* Positive product: must stay below the sign bit of the low half. */ \
        if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
            fEfl |= X86_EFL_CF | X86_EFL_OF; \
    } \
    else \
    { \
        /* Negative product: the magnitude may be exactly the sign bit value. */ \
        if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
            fEfl |= X86_EFL_CF | X86_EFL_OF; \
        a_fnNeg(Result, a_cBitsWidth2x); \
    } \
    a_fnStore(Result); \
    \
    if (a_fIntelFlags) \
    { \
        /* Intel: AF and ZF cleared, SF and PF from the low half of the result. */ \
        fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
        fEfl |= g_afParity[Result.s.Lo & 0xff]; \
        if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
            fEfl |= X86_EFL_SF; \
    } \
    *pfEFlags = fEfl; \
    return 0; \
}
/**
 * Emits the three flavours of an IMUL worker for one operand width: the
 * unsuffixed and the _intel one (both with a_fIntelFlags = 1) and the _amd one
 * (a_fIntelFlags = 0).  The parameters are passed straight on to
 * EMIT_IMUL_INNER.
 */
# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
    EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
    EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel,     1) \
    EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd,       0)

/* The 64-bit workers are always emitted here, the narrower ones only when
   RT_ARCH_X86 is not defined or IEM_WITHOUT_ASSEMBLY is defined.  The 8-bit
   worker takes AX via puAX instead of the puA/puD pair. */
# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
          MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
#  if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
          MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
          MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
          MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
#  endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
# endif /* !DOXYGEN_RUNNING */
2527
2528
2529/*
2530 * IMUL with two operands are mapped onto the three operand variant, ignoring
2531 * the high part of the product.
2532 */
2533# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2534IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2535{ \
2536 a_uType uIgn; \
2537 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2538} \
2539\
2540IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2541{ \
2542 a_uType uIgn; \
2543 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2544} \
2545\
2546IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2547{ \
2548 a_uType uIgn; \
2549 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2550}
2551
2552EMIT_IMUL_TWO(64, uint64_t)
2553# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2554EMIT_IMUL_TWO(32, uint32_t)
2555EMIT_IMUL_TWO(16, uint16_t)
2556# endif
2557
2558
2559/*
2560 * DIV
2561 */
/**
 * Emits one unsigned DIV worker.
 *
 * The worker returns -1 (\#DE) when the divisor is zero or when the high half
 * of the dividend is not below the divisor (the quotient would not fit).
 * Otherwise it stores quotient and remainder and returns 0.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_cBitsWidth2x  Width of the dividend in bits.
 * @param   a_Args          Parenthesized parameter list of the worker.
 * @param   a_CallArgs      Not referenced by this macro.
 * @param   a_fnLoad        Helper loading the dividend.
 * @param   a_fnStore       Helper storing quotient and remainder.
 * @param   a_fnDivRem      Helper doing the division.
 * @param   a_Suffix        Suffix appended to the generated function name.
 * @param   a_fIntelFlags   Non-zero for the Intel EFLAGS behaviour, zero for AMD.
 */
# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
                        a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    RTUINT ## a_cBitsWidth2x ## U Dividend; \
    a_fnLoad(Dividend); \
    \
    /* #DE: division by zero or a quotient too large for the destination. */ \
    if (   uDivisor == 0 \
        || Dividend.s.Hi >= uDivisor) \
        return -1; \
    \
    RTUINT ## a_cBitsWidth2x ## U Quotient, Remainder; \
    a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
    a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
    \
    /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone.  AMD 3990X sets AF and clears PF, ZF and SF. */ \
    if (!a_fIntelFlags) \
        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
    return 0; \
}
/**
 * Emits the three flavours of a DIV worker for one operand width: the
 * unsuffixed and the _intel one (both with a_fIntelFlags = 1) and the _amd one
 * (a_fIntelFlags = 0).  The parameters are passed straight on to
 * EMIT_DIV_INNER.
 */
# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
    EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
    EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel,     1) \
    EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd,       0)

/* The 64-bit workers are always emitted here, the narrower ones only when
   RT_ARCH_X86 is not defined or IEM_WITHOUT_ASSEMBLY is defined.  The 8-bit
   worker takes AX via puAX instead of the puA/puD pair. */
# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
         DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
#  if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
         DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
         DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
EMIT_DIV(8,16,  (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
         DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
#  endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
# endif /* !DOXYGEN_RUNNING */
2600
2601
2602/*
2603 * IDIV
2604 *
2605 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2606 * set AF and clear PF, ZF and SF just like it does for DIV.
2607 *
2608 */
/**
 * Emits one signed IDIV worker.
 *
 * The signed division is carried out as an unsigned division of the
 * magnitudes.  The quotient is negative when exactly one of dividend and
 * divisor is negative; the remainder takes the sign of the dividend.  The
 * worker returns -1 (\#DE) when the divisor is zero or the quotient does not
 * fit the signed single-width range, otherwise it stores the results and
 * returns 0.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_cBitsWidth2x  Width of the dividend in bits.
 * @param   a_Args          Parenthesized parameter list of the worker.
 * @param   a_CallArgs      Not referenced by this macro.
 * @param   a_fnLoad        Helper loading the dividend.
 * @param   a_fnStore       Helper storing quotient and remainder.
 * @param   a_fnNeg         Helper negating a double-width value in place.
 * @param   a_fnDivRem      Helper doing the unsigned division.
 * @param   a_Suffix        Suffix appended to the generated function name.
 * @param   a_fIntelFlags   Non-zero for the Intel EFLAGS behaviour, zero for AMD.
 */
# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    /* Note! Skylake leaves all flags alone. */ \
    \
    /** @todo overflow checks */ \
    /* Division by zero => #DE. */ \
    if (uDivisor == 0) \
        return -1; \
    \
    /* \
     * Convert to unsigned division: split dividend and divisor into sign \
     * and magnitude. \
     */ \
    RTUINT ## a_cBitsWidth2x ## U Dividend; \
    a_fnLoad(Dividend); \
    bool const fNegDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
    if (fNegDividend) \
        a_fnNeg(Dividend, a_cBitsWidth2x); \
    \
    bool const fNegDivisor = RT_BOOL(uDivisor & RT_BIT_64(a_cBitsWidth - 1)); \
    uint ## a_cBitsWidth ## _t const uDivisorMagnitude = fNegDivisor \
                                                       ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - uDivisor) \
                                                       : uDivisor; \
    \
    RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
    a_fnDivRem(Quotient, Remainder, Dividend, uDivisorMagnitude); \
    \
    /* \
     * Check for overflow.  A negative quotient may have a magnitude of up \
     * to 2^(width-1), a positive one only up to the signed maximum. \
     */ \
    bool const     fNegQuotient  = fNegDividend != fNegDivisor; \
    uint64_t const uMaxMagnitude = fNegQuotient \
                                 ? RT_BIT_64(a_cBitsWidth - 1) \
                                 : (uint64_t)INT ## a_cBitsWidth ## _MAX; \
    if (Quotient.s.Hi != 0 || Quotient.s.Lo > uMaxMagnitude) \
        return -1; /* #DE */ \
    \
    /* \
     * Apply the signs and store the result. \
     */ \
    uint ## a_cBitsWidth ## _t const uQuotient  = fNegQuotient \
                                                ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo) \
                                                : (uint ## a_cBitsWidth ## _t)Quotient.s.Lo; \
    uint ## a_cBitsWidth ## _t const uRemainder = fNegDividend \
                                                ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo) \
                                                : (uint ## a_cBitsWidth ## _t)Remainder.s.Lo; \
    a_fnStore(uQuotient, uRemainder); \
    \
    /* AMD sets AF and clears PF, ZF and SF; Intel leaves the flags alone. */ \
    if (!a_fIntelFlags) \
        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
    return 0; \
}
/** Emits the three flavours of one IDIV worker: the unsuffixed and the _intel
 *  one (both with a_fIntelFlags=1) plus the _amd one (a_fIntelFlags=0). */
2693# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2694 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2695 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2696 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2697
2698# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
/* 64-bit IDIV workers; the 128-bit dividend goes through the U128 negate/divide helpers. */
2699EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2700 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
/* The narrower widths are compiled out for RT_ARCH_X86 builds unless IEM_WITHOUT_ASSEMBLY is defined. */
2701# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2702EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2703 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2704EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2705 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
/* The 8-bit variant takes AX as a single 16-bit operand and uses the _U8 load/store macros. */
2706EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2707 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2708# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2709# endif /* !DOXYGEN_RUNNING */
2710
2711#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2712
2713
2714/*********************************************************************************************************************************
2715* Unary operations. *
2716*********************************************************************************************************************************/
2717#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2718
2719/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
2720 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
2721 *
2722 * CF is NOT modified for hysterical raisins (allegedly for carrying and
2723 * borrowing in arithmetic loops on intel 8008).
2724 *
2725 * @note Expands to a do/while statement and therefore yields no value.
2726 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2727 * @param a_uResult Unsigned result value.
2728 * @param a_uDst The original destination value (for AF calc).
2729 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2730 * @param a_OfMethod 0 for INC-style, 1 for DEC-style.
2731 */
2732#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
2733 do { \
2734 uint32_t fEflTmp = *(a_pfEFlags); \
2735 fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; /* mask is (~status) | CF: drops every status bit but keeps CF */ \
2736 fEflTmp |= g_afParity[(a_uResult) & 0xff]; /* parity lookup on the low byte of the result */ \
2737 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; /* AF: result XOR original at the AF bit position */ \
2738 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2739 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
 /* OF, assuming X86_EFL_GET_OF_<n> maps bit n-1 to OF: the INC form has that bit set when it was \
 clear in a_uDst and is set in a_uResult; the DEC form when it was set in a_uDst and is clear \
 in a_uResult. */ \
2740 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
2741 : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
2742 *(a_pfEFlags) = fEflTmp; \
2743 } while (0)
2744
2745/*
2746 * INC
2747 */
2748
/** INC on a 64-bit operand: stores *puDst + 1 and refreshes PF/AF/ZF/SF/OF through the INC/DEC flag macro (CF is kept). */
2749IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2750{
2751 uint64_t const uOld = *puDst;
2752 uint64_t const uNew = uOld + 1;
2753 *puDst = uNew;
2754 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uNew, uOld, 64, 0 /*INC*/);
2755}
2756
2757# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2758
/** INC on a 32-bit operand: stores *puDst + 1 and refreshes PF/AF/ZF/SF/OF through the INC/DEC flag macro (CF is kept). */
2759IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2760{
2761 uint32_t const uOld = *puDst;
2762 uint32_t const uNew = uOld + 1;
2763 *puDst = uNew;
2764 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uNew, uOld, 32, 0 /*INC*/);
2765}
2766
2767
/** INC on a 16-bit operand: stores the wrapped *puDst + 1 and refreshes PF/AF/ZF/SF/OF through the INC/DEC flag macro (CF is kept). */
2768IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2769{
2770 uint16_t const uOld = *puDst;
2771 uint16_t const uNew = (uint16_t)(uOld + 1); /* the sum is computed as int; truncate back to 16 bits */
2772 *puDst = uNew;
2773 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uNew, uOld, 16, 0 /*INC*/);
2774}
2775
/** INC on an 8-bit operand: stores the wrapped *puDst + 1 and refreshes PF/AF/ZF/SF/OF through the INC/DEC flag macro (CF is kept). */
2776IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2777{
2778 uint8_t const uOld = *puDst;
2779 uint8_t const uNew = (uint8_t)(uOld + 1); /* the sum is computed as int; truncate back to 8 bits */
2780 *puDst = uNew;
2781 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uNew, uOld, 8, 0 /*INC*/);
2782}
2783
2784# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2785
2786
2787/*
2788 * DEC
2789 */
2790
/** DEC on a 64-bit operand: stores *puDst - 1 and refreshes PF/AF/ZF/SF/OF through the INC/DEC flag macro (CF is kept). */
2791IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2792{
2793 uint64_t const uOld = *puDst;
2794 uint64_t const uNew = uOld - 1;
2795 *puDst = uNew;
2796 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uNew, uOld, 64, 1 /*DEC*/);
2797}
2798
2799# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2800
/** DEC on a 32-bit operand: stores *puDst - 1 and refreshes PF/AF/ZF/SF/OF through the INC/DEC flag macro (CF is kept). */
2801IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2802{
2803 uint32_t const uOld = *puDst;
2804 uint32_t const uNew = uOld - 1;
2805 *puDst = uNew;
2806 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uNew, uOld, 32, 1 /*DEC*/);
2807}
2808
2809
/** DEC on a 16-bit operand: stores the wrapped *puDst - 1 and refreshes PF/AF/ZF/SF/OF through the INC/DEC flag macro (CF is kept). */
2810IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2811{
2812 uint16_t const uOld = *puDst;
2813 uint16_t const uNew = (uint16_t)(uOld - 1); /* the difference is computed as int; truncate back to 16 bits */
2814 *puDst = uNew;
2815 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uNew, uOld, 16, 1 /*DEC*/);
2816}
2817
2818
/** DEC on an 8-bit operand: stores the wrapped *puDst - 1 and refreshes PF/AF/ZF/SF/OF through the INC/DEC flag macro (CF is kept). */
2819IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2820{
2821 uint8_t const uOld = *puDst;
2822 uint8_t const uNew = (uint8_t)(uOld - 1); /* the difference is computed as int; truncate back to 8 bits */
2823 *puDst = uNew;
2824 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uNew, uOld, 8, 1 /*DEC*/);
2825}
2826
2827# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2828
2829
2830/*
2831 * NOT
2832 */
2833
/** NOT on a 64-bit operand: replaces *puDst with its bitwise complement. */
2834IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2835{
2836 uint64_t const uInverted = ~*puDst;
2837 *puDst = uInverted;
2838
2839 /* NOT leaves EFLAGS untouched, hence the unused flags pointer. */
2840 RT_NOREF_PV(pfEFlags);
2841}
2842
2843# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2844
/** NOT on a 32-bit operand: replaces *puDst with its bitwise complement. */
2845IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2846{
2847 uint32_t const uInverted = ~*puDst;
2848 *puDst = uInverted;
2849
2850 /* NOT leaves EFLAGS untouched, hence the unused flags pointer. */
2851 RT_NOREF_PV(pfEFlags);
2852}
2853
/** NOT on a 16-bit operand: replaces *puDst with its bitwise complement. */
2854IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2855{
2856 uint16_t const uInverted = (uint16_t)~*puDst; /* the complement is computed as int; keep the low 16 bits */
2857 *puDst = uInverted;
2858
2859 /* NOT leaves EFLAGS untouched, hence the unused flags pointer. */
2860 RT_NOREF_PV(pfEFlags);
2861}
2862
/** NOT on an 8-bit operand: replaces *puDst with its bitwise complement. */
2863IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2864{
2865 uint8_t const uInverted = (uint8_t)~*puDst; /* the complement is computed as int; keep the low 8 bits */
2866 *puDst = uInverted;
2867
2868 /* NOT leaves EFLAGS untouched, hence the unused flags pointer. */
2869 RT_NOREF_PV(pfEFlags);
2870}
2871
2872# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2873
2874
2875/*
2876 * NEG
2877 */
2878
2879/**
2880 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for an NEG instruction.
2881 *
2882 * @note Expands to a do/while statement and therefore yields no value.
2883 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2884 * @param a_uResult Unsigned result value.
2885 * @param a_uDst The original destination value (for AF calc).
2886 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2887 */
2888#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
2889 do { \
2890 uint32_t fEflTmp = *(a_pfEFlags); \
2891 fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; /* unlike INC/DEC, CF is cleared here as well */ \
2892 fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; /* CF is set for any non-zero original operand */ \
2893 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
2894 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2895 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2896 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2897 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); /* input is the AND of operand and result */ \
2898 *(a_pfEFlags) = fEflTmp; \
2899 } while (0)
2900
/** NEG on a 64-bit operand: stores the two's complement of *puDst and updates the status flags through the NEG flag macro. */
2901IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2902{
2903 uint64_t const uOld = *puDst;
2904 uint64_t const uNew = (uint64_t)(0 - uOld);
2905 *puDst = uNew;
2906 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uNew, uOld, 64);
2907}
2908
2909# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2910
/** NEG on a 32-bit operand: stores the two's complement of *puDst and updates the status flags through the NEG flag macro. */
2911IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2912{
2913 uint32_t const uOld = *puDst;
2914 uint32_t const uNew = (uint32_t)(0 - uOld);
2915 *puDst = uNew;
2916 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uNew, uOld, 32);
2917}
2918
2919
/** NEG on a 16-bit operand: stores the two's complement of *puDst and updates the status flags through the NEG flag macro. */
2920IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2921{
2922 uint16_t const uOld = *puDst;
2923 uint16_t const uNew = (uint16_t)(0 - uOld); /* computed as int; truncate back to 16 bits */
2924 *puDst = uNew;
2925 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uNew, uOld, 16);
2926}
2927
2928
/** NEG on an 8-bit operand: stores the two's complement of *puDst and updates the status flags through the NEG flag macro. */
2929IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2930{
2931 uint8_t const uOld = *puDst;
2932 uint8_t const uNew = (uint8_t)(0 - uOld); /* computed as int; truncate back to 8 bits */
2933 *puDst = uNew;
2934 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uNew, uOld, 8);
2935}
2936
2937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2938
2939/*
2940 * Locked variants.
2941 */
2942
2943/** Emit a function for doing a locked unary operand operation.
 *
 * The emitted iemAImpl_<a_Mnemonic>_u<a_cBitsWidth>_locked function reads the
 * destination once, applies the plain (non-locked) worker to local copies of
 * the value and of EFLAGS, and retries until ASMAtomicCmpXchgExU<n> manages to
 * swap the new value in.  The caller's EFLAGS is only written after that.
 *
 * @param a_Mnemonic       Instruction mnemonic used in the worker names.
 * @param a_cBitsWidth     Operand width in bits.
 */
2944# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2945 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2946 uint32_t *pfEFlags)) \
2947 { \
2948 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2949 uint ## a_cBitsWidth ## _t uTmp; \
2950 uint32_t fEflTmp; \
2951 do \
2952 { \
2953 uTmp = uOld; /* restart from the value the exchange is going to compare against */ \
2954 fEflTmp = *pfEFlags; /* and from the caller's unmodified flags */ \
2955 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
 /* NOTE(review): &uOld presumably receives the current memory value when the exchange fails - confirm. */ \
2956 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2957 *pfEFlags = fEflTmp; \
2958 }
2959
/* Locked flavours of INC/DEC/NOT/NEG.  The 64-bit ones are emitted whenever this
   region is compiled; the narrower ones are dropped for RT_ARCH_X86 builds unless
   IEM_WITHOUT_ASSEMBLY is defined. */
2960EMIT_LOCKED_UNARY_OP(inc, 64)
2961EMIT_LOCKED_UNARY_OP(dec, 64)
2962EMIT_LOCKED_UNARY_OP(not, 64)
2963EMIT_LOCKED_UNARY_OP(neg, 64)
2964# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2965EMIT_LOCKED_UNARY_OP(inc, 32)
2966EMIT_LOCKED_UNARY_OP(dec, 32)
2967EMIT_LOCKED_UNARY_OP(not, 32)
2968EMIT_LOCKED_UNARY_OP(neg, 32)
2969
2970EMIT_LOCKED_UNARY_OP(inc, 16)
2971EMIT_LOCKED_UNARY_OP(dec, 16)
2972EMIT_LOCKED_UNARY_OP(not, 16)
2973EMIT_LOCKED_UNARY_OP(neg, 16)
2974
2975EMIT_LOCKED_UNARY_OP(inc, 8)
2976EMIT_LOCKED_UNARY_OP(dec, 8)
2977EMIT_LOCKED_UNARY_OP(not, 8)
2978EMIT_LOCKED_UNARY_OP(neg, 8)
2979# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2980
2981#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2982
2983
2984/*********************************************************************************************************************************
2985* Shifting and Rotating *
2986*********************************************************************************************************************************/
2987
2988/*
2989 * ROL
 *
 * The emitted worker rotates *puDst left by cShift using a_fnHlp.  The count
 * is masked with 63 for 64-bit operands and with 31 for everything else; a
 * masked count of zero leaves both the destination and EFLAGS untouched.
 * Only CF and OF are rewritten: CF is bit 0 of the result, OF depends on the
 * a_fIntelFlags flavour.
2990 */
2991#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2992IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2993{ \
2994 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2995 if (cShift) \
2996 { \
 /* Narrow operands: reduce to the operand width only after the zero check, so that \
 e.g. a count of 16 on a 16-bit operand still rewrites the flags. */ \
2997 if (a_cBitsWidth < 32) \
2998 cShift &= a_cBitsWidth - 1; \
2999 a_uType const uDst = *puDst; \
3000 a_uType const uResult = a_fnHlp(uDst, cShift); \
3001 *puDst = uResult; \
3002 \
3003 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3004 it the same way as for 1 bit shifts. */ \
3005 AssertCompile(X86_EFL_CF_BIT == 0); \
3006 uint32_t fEfl = *pfEFlags; \
3007 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3008 uint32_t const fCarry = (uResult & X86_EFL_CF); /* CF is bit 0, see the AssertCompile above */ \
3009 fEfl |= fCarry; \
3010 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3011 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
3012 else /* Intel 10980XE: According to the first sub-shift: */ \
3013 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3014 *pfEFlags = fEfl; \
3015 } \
3016}
3017
/* 64-bit and 32-bit ROL workers.  The unsuffixed variants use the Intel flag
   behaviour (a_fIntelFlags=1) and are only emitted under the host-architecture
   guards, presumably because an assembly version exists otherwise - confirm. */
3018#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3019EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
3020#endif
3021EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
3022EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
3023
3024#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3025EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
3026#endif
3027EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
3028EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
3029
/** Rotates a 16-bit value left by cShift bits; EMIT_ROL passes counts in the 0..15 range. */
3030DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
3031{
3032 return (uint16_t)((uValue >> (16 - cShift)) | (uValue << cShift));
3033}
/* 16-bit ROL workers; the unsuffixed one uses the Intel flag behaviour (a_fIntelFlags=1). */
3034#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3035EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
3036#endif
3037EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
3038EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
3039
/** Rotates an 8-bit value left by cShift bits; EMIT_ROL passes counts in the 0..7 range. */
3040DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
3041{
3042 return (uint8_t)((uValue >> (8 - cShift)) | (uValue << cShift));
3043}
/* 8-bit ROL workers; the unsuffixed one uses the Intel flag behaviour (a_fIntelFlags=1). */
3044#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3045EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
3046#endif
3047EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
3048EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
3049
3050
3051/*
3052 * ROR
 *
 * The emitted worker rotates *puDst right by cShift using a_fnHlp.  The count
 * is masked with 63 for 64-bit operands and with 31 for everything else; a
 * masked count of zero leaves both the destination and EFLAGS untouched.
 * Only CF and OF are rewritten: CF is the most significant bit of the result,
 * OF depends on the a_fIntelFlags flavour.
3053 */
3054#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
3055IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3056{ \
3057 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3058 if (cShift) \
3059 { \
 /* Narrow operands: reduce to the operand width only after the zero check, so that \
 e.g. a count of 16 on a 16-bit operand still rewrites the flags. */ \
3060 if (a_cBitsWidth < 32) \
3061 cShift &= a_cBitsWidth - 1; \
3062 a_uType const uDst = *puDst; \
3063 a_uType const uResult = a_fnHlp(uDst, cShift); \
3064 *puDst = uResult; \
3065 \
3066 /* Calc EFLAGS: */ \
3067 AssertCompile(X86_EFL_CF_BIT == 0); \
3068 uint32_t fEfl = *pfEFlags; \
3069 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3070 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; /* top bit of the result */ \
3071 fEfl |= fCarry; \
3072 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3073 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
3074 else /* Intel 10980XE: According to the first sub-shift: */ \
3075 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
3076 *pfEFlags = fEfl; \
3077 } \
3078}
3079
/* 64-bit and 32-bit ROR workers.  The unsuffixed variants use the Intel flag
   behaviour (a_fIntelFlags=1) and are only emitted under the host-architecture
   guards, presumably because an assembly version exists otherwise - confirm. */
3080#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3081EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
3082#endif
3083EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
3084EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
3085
3086#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3087EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
3088#endif
3089EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
3090EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
3091
/** Rotates a 16-bit value right by cShift bits; EMIT_ROR passes counts in the 0..15 range. */
3092DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
3093{
3094 return (uint16_t)((uValue << (16 - cShift)) | (uValue >> cShift));
3095}
/* 16-bit ROR workers; the unsuffixed one uses the Intel flag behaviour (a_fIntelFlags=1). */
3096#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3097EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
3098#endif
3099EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
3100EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
3101
/** Rotates an 8-bit value right by cShift bits; EMIT_ROR passes counts in the 0..7 range. */
3102DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
3103{
3104 return (uint8_t)((uValue << (8 - cShift)) | (uValue >> cShift));
3105}
/* 8-bit ROR workers; the unsuffixed one uses the Intel flag behaviour (a_fIntelFlags=1). */
3106#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3107EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
3108#endif
3109EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
3110EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
3111
3112
3113/*
3114 * RCL
 *
 * Rotate left through carry: the incoming CF becomes bit (cShift - 1) of the
 * result and CF receives the last bit rotated out.  For 8 and 16-bit operands
 * the count is additionally reduced modulo (width + 1); the Intel flavour
 * does that before the zero check (so such a count changes nothing at all),
 * the AMD flavour after it (so the flags are still rewritten).
3115 */
3116#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3117IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3118{ \
3119 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3120 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3121 cShift %= a_cBitsWidth + 1; \
3122 if (cShift) \
3123 { \
3124 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3125 cShift %= a_cBitsWidth + 1; \
3126 a_uType const uDst = *puDst; \
3127 a_uType uResult = uDst << cShift; \
3128 if (cShift > 1) \
3129 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
3130 \
3131 AssertCompile(X86_EFL_CF_BIT == 0); \
3132 uint32_t fEfl = *pfEFlags; \
3133 uint32_t fInCarry = fEfl & X86_EFL_CF; \
 /* NOTE(review): if the AMD modulo above reduced cShift to 0, (cShift - 1) is a negative \
 shift count here - confirm this is intended/harmless on all supported hosts. */ \
3134 uResult |= (a_uType)fInCarry << (cShift - 1); \
3135 \
3136 *puDst = uResult; \
3137 \
3138 /* Calc EFLAGS. */ \
3139 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
 /* Only the narrow AMD flavour can arrive here with cShift == 0; CF then keeps its old value. */ \
3140 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3141 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
3142 fEfl |= fOutCarry; \
3143 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3144 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3145 else /* Intel 10980XE: According to the first sub-shift: */ \
3146 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3147 *pfEFlags = fEfl; \
3148 } \
3149}
3150
/* RCL workers for all four widths.  The unsuffixed variants use the Intel flag
   behaviour (a_fIntelFlags=1) and are only emitted under the host-architecture
   guards, presumably because an assembly version exists otherwise - confirm. */
3151#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3152EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3153#endif
3154EMIT_RCL(64, uint64_t, _intel, 1)
3155EMIT_RCL(64, uint64_t, _amd, 0)
3156
3157#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3158EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3159#endif
3160EMIT_RCL(32, uint32_t, _intel, 1)
3161EMIT_RCL(32, uint32_t, _amd, 0)
3162
3163#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3164EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3165#endif
3166EMIT_RCL(16, uint16_t, _intel, 1)
3167EMIT_RCL(16, uint16_t, _amd, 0)
3168
3169#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3170EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3171#endif
3172EMIT_RCL(8, uint8_t, _intel, 1)
3173EMIT_RCL(8, uint8_t, _amd, 0)
3174
3175
3176/*
3177 * RCR
 *
 * Rotate right through carry: the incoming CF becomes bit (width - cShift) of
 * the result and CF receives the last bit rotated out.  For 8 and 16-bit
 * operands the count is additionally reduced modulo (width + 1); the Intel
 * flavour does that before the zero check (so such a count changes nothing at
 * all), the AMD flavour after it (so the flags are still rewritten).
3178 */
3179#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3180IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3181{ \
3182 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3183 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3184 cShift %= a_cBitsWidth + 1; \
3185 if (cShift) \
3186 { \
3187 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3188 cShift %= a_cBitsWidth + 1; \
3189 a_uType const uDst = *puDst; \
3190 a_uType uResult = uDst >> cShift; \
3191 if (cShift > 1) \
3192 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3193 \
3194 AssertCompile(X86_EFL_CF_BIT == 0); \
3195 uint32_t fEfl = *pfEFlags; \
3196 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3197 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3198 *puDst = uResult; \
3199 \
3200 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3201 it the same way as for 1 bit shifts. */ \
3202 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
 /* Only the narrow AMD flavour can arrive here with cShift == 0; CF then keeps its old value. */ \
3203 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3204 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3205 fEfl |= fOutCarry; \
3206 if (!a_fIntelFlags) /* AMD 3990X: XOR two most significant bits of the result: */ \
3207 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3208 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3209 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3210 *pfEFlags = fEfl; \
3211 } \
3212}
3213
/* RCR workers for all four widths.  The unsuffixed variants use the Intel flag
   behaviour (a_fIntelFlags=1) and are only emitted under the host-architecture
   guards, presumably because an assembly version exists otherwise - confirm. */
3214#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3215EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3216#endif
3217EMIT_RCR(64, uint64_t, _intel, 1)
3218EMIT_RCR(64, uint64_t, _amd, 0)
3219
3220#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3221EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3222#endif
3223EMIT_RCR(32, uint32_t, _intel, 1)
3224EMIT_RCR(32, uint32_t, _amd, 0)
3225
3226#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3227EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3228#endif
3229EMIT_RCR(16, uint16_t, _intel, 1)
3230EMIT_RCR(16, uint16_t, _amd, 0)
3231
3232#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3233EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3234#endif
3235EMIT_RCR(8, uint8_t, _intel, 1)
3236EMIT_RCR(8, uint8_t, _amd, 0)
3237
3238
3239/*
3240 * SHL
 *
 * The count is masked with 63 for 64-bit operands and with 31 for everything
 * else; a masked count of zero leaves both the destination and EFLAGS
 * untouched.  Otherwise all status bits are recomputed: CF is the last bit
 * shifted out, SF/ZF/PF follow the result, OF and AF depend on the
 * a_fIntelFlags flavour.
3241 */
3242#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3243IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3244{ \
3245 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3246 if (cShift) \
3247 { \
3248 a_uType const uDst = *puDst; \
3249 a_uType uResult = uDst << cShift; \
3250 *puDst = uResult; \
3251 \
3252 /* Calc EFLAGS. */ \
3253 AssertCompile(X86_EFL_CF_BIT == 0); \
3254 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
 /* NOTE(review): for 8/16-bit operands cShift may exceed the width (mask is 31), which makes \
 the shift count below negative - confirm this is intended. */ \
3255 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3256 fEfl |= fCarry; \
3257 if (!a_fIntelFlags) \
3258 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3259 else \
3260 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3261 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3262 fEfl |= X86_EFL_CALC_ZF(uResult); \
3263 fEfl |= g_afParity[uResult & 0xff]; \
3264 if (!a_fIntelFlags) \
3265 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
3266 *pfEFlags = fEfl; \
3267 } \
3268}
3269
/* SHL workers for all four widths.  The unsuffixed variants use the Intel flag
   behaviour (a_fIntelFlags=1) and are only emitted under the host-architecture
   guards, presumably because an assembly version exists otherwise - confirm. */
3270#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3271EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3272#endif
3273EMIT_SHL(64, uint64_t, _intel, 1)
3274EMIT_SHL(64, uint64_t, _amd, 0)
3275
3276#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3277EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3278#endif
3279EMIT_SHL(32, uint32_t, _intel, 1)
3280EMIT_SHL(32, uint32_t, _amd, 0)
3281
3282#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3283EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3284#endif
3285EMIT_SHL(16, uint16_t, _intel, 1)
3286EMIT_SHL(16, uint16_t, _amd, 0)
3287
3288#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3289EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3290#endif
3291EMIT_SHL(8, uint8_t, _intel, 1)
3292EMIT_SHL(8, uint8_t, _amd, 0)
3293
3294
3295/*
3296 * SHR
 *
 * Logical right shift.  The count is masked with 63 for 64-bit operands and
 * with 31 for everything else; a masked count of zero leaves both the
 * destination and EFLAGS untouched.  Otherwise all status bits are
 * recomputed: CF is the last bit shifted out, SF/ZF/PF follow the result, OF
 * and AF depend on the a_fIntelFlags flavour.
3297 */
3298#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3299IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3300{ \
3301 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3302 if (cShift) \
3303 { \
3304 a_uType const uDst = *puDst; \
3305 a_uType uResult = uDst >> cShift; \
3306 *puDst = uResult; \
3307 \
3308 /* Calc EFLAGS. */ \
3309 AssertCompile(X86_EFL_CF_BIT == 0); \
3310 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3311 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
 /* OF is the most significant bit of the original operand when it is set at all. */ \
3312 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3313 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3314 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3315 fEfl |= X86_EFL_CALC_ZF(uResult); \
3316 fEfl |= g_afParity[uResult & 0xff]; \
3317 if (!a_fIntelFlags) \
3318 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
3319 *pfEFlags = fEfl; \
3320 } \
3321}
3322
/* SHR workers for all four widths.  The unsuffixed variants use the Intel flag
   behaviour (a_fIntelFlags=1) and are only emitted under the host-architecture
   guards, presumably because an assembly version exists otherwise - confirm. */
3323#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3324EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3325#endif
3326EMIT_SHR(64, uint64_t, _intel, 1)
3327EMIT_SHR(64, uint64_t, _amd, 0)
3328
3329#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3330EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3331#endif
3332EMIT_SHR(32, uint32_t, _intel, 1)
3333EMIT_SHR(32, uint32_t, _amd, 0)
3334
3335#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3336EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3337#endif
3338EMIT_SHR(16, uint16_t, _intel, 1)
3339EMIT_SHR(16, uint16_t, _amd, 0)
3340
3341#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3342EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3343#endif
3344EMIT_SHR(8, uint8_t, _intel, 1)
3345EMIT_SHR(8, uint8_t, _amd, 0)
3346
3347
3348/*
3349 * SAR
 *
 * Arithmetic right shift, done on the signed a_iType view of the operand.
 * The count is masked with 63 for 64-bit operands and with 31 for everything
 * else; a masked count of zero leaves both the destination and EFLAGS
 * untouched.  Otherwise all status bits are recomputed: CF is the last bit
 * shifted out, SF/ZF/PF follow the result, OF is left clear and AF depends on
 * the a_fIntelFlags flavour.
3350 */
3351#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3352IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3353{ \
3354 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3355 if (cShift) \
3356 { \
3357 a_iType const iDst = (a_iType)*puDst; \
3358 a_uType uResult = iDst >> cShift; \
3359 *puDst = uResult; \
3360 \
3361 /* Calc EFLAGS. \
3362 Note! The OF flag is always zero because the result never differs from the input. */ \
3363 AssertCompile(X86_EFL_CF_BIT == 0); \
3364 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3365 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3366 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3367 fEfl |= X86_EFL_CALC_ZF(uResult); \
3368 fEfl |= g_afParity[uResult & 0xff]; \
3369 if (!a_fIntelFlags) \
3370 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
3371 *pfEFlags = fEfl; \
3372 } \
3373}
3374
/* SAR workers for all four widths.  The unsuffixed variants use the Intel flag
   behaviour (a_fIntelFlags=1).  Their guards follow the same scheme as the
   other shift/rotate families in this file: the 64-bit one is keyed on
   RT_ARCH_AMD64 only, the 32/16/8-bit ones on both RT_ARCH_X86 and
   RT_ARCH_AMD64, so the C workers are not emitted on x86 hosts either unless
   IEM_WITHOUT_ASSEMBLY is defined. */
3375#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3376EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3377#endif
3378EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3379EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3380
3381#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3382EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3383#endif
3384EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3385EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3386
3387#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3388EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3389#endif
3390EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3391EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3392
3393#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3394EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3395#endif
3396EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3397EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3398
3399
3400/*
3401 * SHLD
3402 *
3403 * - CF is the last bit shifted out of puDst.
3404 * - AF is always cleared by Intel 10980XE.
3405 * - AF is always set by AMD 3990X.
3406 * - OF is set according to the first shift on Intel 10980XE, it seems.
3407 * - OF is set according to the last sub-shift on AMD 3990X.
3408 * - ZF, SF and PF are calculated according to the result by both vendors.
3409 *
3410 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3411 * pick either the source register or the destination register for input bits
3412 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3413 * intel has changed behaviour here several times. We implement what current
3414 * skylake based does for now, we can extend this later as needed.
3415 */
/* Generic emitter: the count is masked with (a_cBitsWidth - 1) and a masked count of
   zero leaves destination and EFLAGS untouched.  The 16-bit worker does not use this
   macro; it has a dedicated one (EMIT_SHLD_16). */
3416#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3417IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3418 uint32_t *pfEFlags)) \
3419{ \
3420 cShift &= a_cBitsWidth - 1; \
3421 if (cShift) \
3422 { \
3423 a_uType const uDst = *puDst; \
3424 a_uType uResult = uDst << cShift; \
3425 uResult |= uSrc >> (a_cBitsWidth - cShift); /* the vacated low bits are filled from the top of uSrc */ \
3426 *puDst = uResult; \
3427 \
3428 /* CALC EFLAGS: */ \
3429 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3430 if (a_fIntelFlags) \
3431 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3432 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3433 else \
3434 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3435 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3436 fEfl |= X86_EFL_AF; \
3437 } \
3438 AssertCompile(X86_EFL_CF_BIT == 0); \
3439 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3440 fEfl |= g_afParity[uResult & 0xff]; \
3441 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3442 fEfl |= X86_EFL_CALC_ZF(uResult); \
3443 *pfEFlags = fEfl; \
3444 } \
3445}
3446
/* 64-bit and 32-bit SHLD workers.  The unsuffixed variants use the Intel flag
   behaviour (a_fIntelFlags=1) and are only emitted under the host-architecture
   guards, presumably because an assembly version exists otherwise - confirm. */
3447#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3448EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3449#endif
3450EMIT_SHLD(64, uint64_t, _intel, 1)
3451EMIT_SHLD(64, uint64_t, _amd, 0)
3452
3453#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3454EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3455#endif
3456EMIT_SHLD(32, uint32_t, _intel, 1)
3457EMIT_SHLD(32, uint32_t, _amd, 0)
3458
/* 16-bit SHLD emitter.  The count is masked with 31 rather than 15, so the shift is
   carried out on a 48-bit pattern held in a 64-bit temporary: uDst in bits 47:32, uSrc in
   bits 31:16, and in bits 15:0 either uDst again (Intel flavour) or uSrc again (AMD
   flavour).  The result is bits 47:32 of that pattern after shifting left by cShift. */
3459#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3460IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3461{ \
3462 cShift &= 31; \
3463 if (cShift) \
3464 { \
3465 uint16_t const uDst = *puDst; \
3466 uint64_t const uTmp = a_fIntelFlags \
3467 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3468 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3469 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3470 *puDst = uResult; \
3471 \
3472 /* CALC EFLAGS: */ \
3473 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3474 AssertCompile(X86_EFL_CF_BIT == 0); \
3475 if (a_fIntelFlags) \
3476 { \
3477 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3478 /* Intel 6700K & 10980XE: OF is set according to the first shift. AF always cleared. */ \
3479 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3480 } \
3481 else \
3482 { \
3483 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3484 if (cShift < 16) \
3485 { \
3486 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3487 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3488 } \
3489 else \
3490 { \
 /* Counts above 16 leave CF clear in this flavour. */ \
3491 if (cShift == 16) \
3492 fEfl |= uDst & X86_EFL_CF; \
3493 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3494 } \
3495 fEfl |= X86_EFL_AF; \
3496 } \
3497 fEfl |= g_afParity[uResult & 0xff]; \
3498 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3499 fEfl |= X86_EFL_CALC_ZF(uResult); \
3500 *pfEFlags = fEfl; \
3501 } \
3502}
3503
/* 16-bit SHLD workers; the unsuffixed one uses the Intel flag behaviour (a_fIntelFlags=1). */
3504#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3505EMIT_SHLD_16(RT_NOTHING, 1)
3506#endif
3507EMIT_SHLD_16(_intel, 1)
3508EMIT_SHLD_16(_amd, 0)
3509
3510
3511/*
3512 * SHRD
3513 *
3514 * EFLAGS behaviour seems to be the same as with SHLD:
3515 * - CF is the last bit shifted out of puDst.
3516 * - AF is always cleared by Intel 10980XE.
3517 * - AF is always set by AMD 3990X.
3518 * - OF is set according to the first shift on Intel 10980XE, it seems.
3519 * - OF is set according to the last sub-shift on AMD 3990X.
3520 * - ZF, SF and PF are calculated according to the result by both vendors.
3521 *
 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
 * pick either the source register or the destination register for input bits
 * when going beyond 16.  According to https://www.sandpile.org/x86/flags.htm
 * Intel has changed behaviour here several times.  We implement what current
 * Skylake-based CPUs do for now; this can be extended later as needed.
3527 */
3528#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3529IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3530{ \
3531 cShift &= a_cBitsWidth - 1; \
3532 if (cShift) \
3533 { \
3534 a_uType const uDst = *puDst; \
3535 a_uType uResult = uDst >> cShift; \
3536 uResult |= uSrc << (a_cBitsWidth - cShift); \
3537 *puDst = uResult; \
3538 \
3539 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3540 AssertCompile(X86_EFL_CF_BIT == 0); \
3541 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3542 if (a_fIntelFlags) \
3543 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3544 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3545 else \
3546 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3547 if (cShift > 1) /* Set according to last shift. */ \
3548 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3549 else \
3550 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3551 fEfl |= X86_EFL_AF; \
3552 } \
3553 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3554 fEfl |= X86_EFL_CALC_ZF(uResult); \
3555 fEfl |= g_afParity[uResult & 0xff]; \
3556 *pfEFlags = fEfl; \
3557 } \
3558}
3559
3560#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3561EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3562#endif
3563EMIT_SHRD(64, uint64_t, _intel, 1)
3564EMIT_SHRD(64, uint64_t, _amd, 0)
3565
3566#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3567EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3568#endif
3569EMIT_SHRD(32, uint32_t, _intel, 1)
3570EMIT_SHRD(32, uint32_t, _amd, 0)
3571
3572#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3573IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3574{ \
3575 cShift &= 31; \
3576 if (cShift) \
3577 { \
3578 uint16_t const uDst = *puDst; \
3579 uint64_t const uTmp = a_fIntelFlags \
3580 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3581 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3582 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3583 *puDst = uResult; \
3584 \
3585 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3586 AssertCompile(X86_EFL_CF_BIT == 0); \
3587 if (a_fIntelFlags) \
3588 { \
3589 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3590 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3591 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3592 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3593 } \
3594 else \
3595 { \
3596 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3597 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3598 /* AMD 3990X: Set according to last shift. AF always set. */ \
3599 if (cShift > 1) /* Set according to last shift. */ \
3600 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3601 else \
3602 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3603 fEfl |= X86_EFL_AF; \
3604 } \
3605 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3606 fEfl |= X86_EFL_CALC_ZF(uResult); \
3607 fEfl |= g_afParity[uResult & 0xff]; \
3608 *pfEFlags = fEfl; \
3609 } \
3610}
3611
3612#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3613EMIT_SHRD_16(RT_NOTHING, 1)
3614#endif
3615EMIT_SHRD_16(_intel, 1)
3616EMIT_SHRD_16(_amd, 0)
3617
3618
3619/*
3620 * RORX (BMI2)
3621 */
3622#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3623IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3624{ \
3625 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3626}
3627
3628#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3629EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3630#endif
3631#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3632EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3633#endif
3634
3635
3636/*
3637 * SHLX (BMI2)
3638 */
3639#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3640IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3641{ \
3642 cShift &= a_cBitsWidth - 1; \
3643 *puDst = uSrc << cShift; \
3644}
3645
3646#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3647EMIT_SHLX(64, uint64_t, RT_NOTHING)
3648EMIT_SHLX(64, uint64_t, _fallback)
3649#endif
3650#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3651EMIT_SHLX(32, uint32_t, RT_NOTHING)
3652EMIT_SHLX(32, uint32_t, _fallback)
3653#endif
3654
3655
3656/*
3657 * SHRX (BMI2)
3658 */
3659#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3660IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3661{ \
3662 cShift &= a_cBitsWidth - 1; \
3663 *puDst = uSrc >> cShift; \
3664}
3665
3666#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3667EMIT_SHRX(64, uint64_t, RT_NOTHING)
3668EMIT_SHRX(64, uint64_t, _fallback)
3669#endif
3670#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3671EMIT_SHRX(32, uint32_t, RT_NOTHING)
3672EMIT_SHRX(32, uint32_t, _fallback)
3673#endif
3674
3675
3676/*
3677 * SARX (BMI2)
3678 */
3679#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3680IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3681{ \
3682 cShift &= a_cBitsWidth - 1; \
3683 *puDst = (a_iType)uSrc >> cShift; \
3684}
3685
3686#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3687EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3688EMIT_SARX(64, uint64_t, int64_t, _fallback)
3689#endif
3690#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3691EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3692EMIT_SARX(32, uint32_t, int32_t, _fallback)
3693#endif
3694
3695
3696/*
3697 * PDEP (BMI2)
3698 */
3699#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3700IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3701{ \
3702 a_uType uResult = 0; \
3703 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3704 if (fMask & ((a_uType)1 << iMaskBit)) \
3705 { \
3706 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3707 iBit++; \
3708 } \
3709 *puDst = uResult; \
3710}
3711
3712#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3713EMIT_PDEP(64, uint64_t, RT_NOTHING)
3714#endif
3715EMIT_PDEP(64, uint64_t, _fallback)
3716#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3717EMIT_PDEP(32, uint32_t, RT_NOTHING)
3718#endif
3719EMIT_PDEP(32, uint32_t, _fallback)
3720
3721/*
3722 * PEXT (BMI2)
3723 */
3724#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3725IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3726{ \
3727 a_uType uResult = 0; \
3728 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3729 if (fMask & ((a_uType)1 << iMaskBit)) \
3730 { \
3731 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3732 iBit++; \
3733 } \
3734 *puDst = uResult; \
3735}
3736
3737#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3738EMIT_PEXT(64, uint64_t, RT_NOTHING)
3739#endif
3740EMIT_PEXT(64, uint64_t, _fallback)
3741#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3742EMIT_PEXT(32, uint32_t, RT_NOTHING)
3743#endif
3744EMIT_PEXT(32, uint32_t, _fallback)
3745
3746
3747#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3748
3749# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3750/*
3751 * BSWAP
3752 */
3753
3754IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3755{
3756 *puDst = ASMByteSwapU64(*puDst);
3757}
3758
3759
3760IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3761{
3762 *puDst = ASMByteSwapU32(*puDst);
3763}
3764
3765
3766/* Note! undocument, so 32-bit arg */
3767IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3768{
3769#if 0
3770 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3771#else
3772 /* This is the behaviour AMD 3990x (64-bit mode): */
3773 *(uint16_t *)puDst = 0;
3774#endif
3775}
3776
3777# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3778
3779
3780
3781# if defined(IEM_WITHOUT_ASSEMBLY)
3782
3783/*
3784 * LFENCE, SFENCE & MFENCE.
3785 */
3786
3787IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3788{
3789 ASMReadFence();
3790}
3791
3792
3793IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3794{
3795 ASMWriteFence();
3796}
3797
3798
3799IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3800{
3801 ASMMemoryFence();
3802}
3803
3804
3805# ifndef RT_ARCH_ARM64
3806IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3807{
3808 ASMMemoryFence();
3809}
3810# endif
3811
3812# endif
3813
3814#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3815
3816
3817IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3818{
3819 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3820 {
3821 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3822 *pu16Dst |= u16Src & X86_SEL_RPL;
3823
3824 *pfEFlags |= X86_EFL_ZF;
3825 }
3826 else
3827 *pfEFlags &= ~X86_EFL_ZF;
3828}
3829
3830
3831#if defined(IEM_WITHOUT_ASSEMBLY)
3832
3833/*********************************************************************************************************************************
3834* x87 FPU Loads *
3835*********************************************************************************************************************************/
3836
3837IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3838{
3839 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3840 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3841 {
3842 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3843 pFpuRes->r80Result.sj64.fInteger = 1;
3844 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3845 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3846 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3847 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3848 }
3849 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3850 {
3851 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3852 pFpuRes->r80Result.s.uExponent = 0;
3853 pFpuRes->r80Result.s.uMantissa = 0;
3854 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3855 }
3856 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3857 {
3858 /* Subnormal values gets normalized. */
3859 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3860 pFpuRes->r80Result.sj64.fInteger = 1;
3861 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3862 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3863 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3864 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3865 pFpuRes->FSW |= X86_FSW_DE;
3866 if (!(pFpuState->FCW & X86_FCW_DM))
3867 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3868 }
3869 else if (RTFLOAT32U_IS_INF(pr32Val))
3870 {
3871 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3872 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3873 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3874 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3875 }
3876 else
3877 {
3878 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3879 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3880 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3881 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3882 pFpuRes->r80Result.sj64.fInteger = 1;
3883 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3884 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3885 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3886 {
3887 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3888 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3889 pFpuRes->FSW |= X86_FSW_IE;
3890
3891 if (!(pFpuState->FCW & X86_FCW_IM))
3892 {
3893 /* The value is not pushed. */
3894 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3895 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3896 pFpuRes->r80Result.au64[0] = 0;
3897 pFpuRes->r80Result.au16[4] = 0;
3898 }
3899 }
3900 else
3901 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3902 }
3903}
3904
3905
3906IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3907{
3908 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3909 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3910 {
3911 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3912 pFpuRes->r80Result.sj64.fInteger = 1;
3913 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3914 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3915 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3916 }
3917 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3918 {
3919 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3920 pFpuRes->r80Result.s.uExponent = 0;
3921 pFpuRes->r80Result.s.uMantissa = 0;
3922 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3923 }
3924 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3925 {
3926 /* Subnormal values gets normalized. */
3927 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3928 pFpuRes->r80Result.sj64.fInteger = 1;
3929 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3930 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3931 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3932 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3933 pFpuRes->FSW |= X86_FSW_DE;
3934 if (!(pFpuState->FCW & X86_FCW_DM))
3935 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3936 }
3937 else if (RTFLOAT64U_IS_INF(pr64Val))
3938 {
3939 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3940 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3941 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3942 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3943 }
3944 else
3945 {
3946 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3947 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3948 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3949 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3950 pFpuRes->r80Result.sj64.fInteger = 1;
3951 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3952 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3953 {
3954 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3955 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3956 pFpuRes->FSW |= X86_FSW_IE;
3957
3958 if (!(pFpuState->FCW & X86_FCW_IM))
3959 {
3960 /* The value is not pushed. */
3961 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3962 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3963 pFpuRes->r80Result.au64[0] = 0;
3964 pFpuRes->r80Result.au16[4] = 0;
3965 }
3966 }
3967 else
3968 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3969 }
3970}
3971
3972
3973IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3974{
3975 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3976 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3977 /* Raises no exceptions. */
3978 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3979}
3980
3981
3982IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3983{
3984 pFpuRes->r80Result.sj64.fSign = 0;
3985 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3986 pFpuRes->r80Result.sj64.fInteger = 1;
3987 pFpuRes->r80Result.sj64.uFraction = 0;
3988
3989 /*
3990 * FPU status word:
3991 * - TOP is irrelevant, but we must match x86 assembly version.
3992 * - C1 is always cleared as we don't have any stack overflows.
3993 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3994 */
3995 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3996}
3997
3998
3999IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4000{
4001 pFpuRes->r80Result.sj64.fSign = 0;
4002 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4003 pFpuRes->r80Result.sj64.fInteger = 1;
4004 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4005 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4006 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
4007 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4008}
4009
4010
4011IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4012{
4013 pFpuRes->r80Result.sj64.fSign = 0;
4014 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4015 pFpuRes->r80Result.sj64.fInteger = 1;
4016 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
4017 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
4018 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4019}
4020
4021
4022IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4023{
4024 pFpuRes->r80Result.sj64.fSign = 0;
4025 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
4026 pFpuRes->r80Result.sj64.fInteger = 1;
4027 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4028 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4029 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
4030 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4031}
4032
4033
4034IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4035{
4036 pFpuRes->r80Result.sj64.fSign = 0;
4037 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
4038 pFpuRes->r80Result.sj64.fInteger = 1;
4039 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4040 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4041 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
4042 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4043}
4044
4045
4046IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4047{
4048 pFpuRes->r80Result.sj64.fSign = 0;
4049 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4050 pFpuRes->r80Result.sj64.fInteger = 1;
4051 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4052 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4053 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
4054 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4055}
4056
4057
4058IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4059{
4060 pFpuRes->r80Result.s.fSign = 0;
4061 pFpuRes->r80Result.s.uExponent = 0;
4062 pFpuRes->r80Result.s.uMantissa = 0;
4063 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4064}
4065
4066#define EMIT_FILD(a_cBits) \
4067IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
4068 int ## a_cBits ## _t const *piVal)) \
4069{ \
4070 int ## a_cBits ## _t iVal = *piVal; \
4071 if (iVal == 0) \
4072 { \
4073 pFpuRes->r80Result.s.fSign = 0; \
4074 pFpuRes->r80Result.s.uExponent = 0; \
4075 pFpuRes->r80Result.s.uMantissa = 0; \
4076 } \
4077 else \
4078 { \
4079 if (iVal > 0) \
4080 pFpuRes->r80Result.s.fSign = 0; \
4081 else \
4082 { \
4083 pFpuRes->r80Result.s.fSign = 1; \
4084 iVal = -iVal; \
4085 } \
4086 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
4087 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
4088 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
4089 } \
4090 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
4091}
4092EMIT_FILD(16)
4093EMIT_FILD(32)
4094EMIT_FILD(64)
4095
4096
4097IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
4098{
4099 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4100 if ( pd80Val->s.abPairs[0] == 0
4101 && pd80Val->s.abPairs[1] == 0
4102 && pd80Val->s.abPairs[2] == 0
4103 && pd80Val->s.abPairs[3] == 0
4104 && pd80Val->s.abPairs[4] == 0
4105 && pd80Val->s.abPairs[5] == 0
4106 && pd80Val->s.abPairs[6] == 0
4107 && pd80Val->s.abPairs[7] == 0
4108 && pd80Val->s.abPairs[8] == 0)
4109 {
4110 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4111 pFpuRes->r80Result.s.uExponent = 0;
4112 pFpuRes->r80Result.s.uMantissa = 0;
4113 }
4114 else
4115 {
4116 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4117
4118 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
4119 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
4120 cPairs--;
4121
4122 uint64_t uVal = 0;
4123 uint64_t uFactor = 1;
4124 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
4125 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
4126 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
4127
4128 unsigned const cBits = ASMBitLastSetU64(uVal);
4129 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
4130 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
4131 }
4132}
4133
4134
4135/*********************************************************************************************************************************
4136* x87 FPU Stores *
4137*********************************************************************************************************************************/
4138
4139/**
4140 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4141 *
4142 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4143 *
4144 * @returns Updated FPU status word value.
4145 * @param fSignIn Incoming sign indicator.
4146 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4147 * @param iExponentIn Unbiased exponent.
4148 * @param fFcw The FPU control word.
4149 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4150 * @param pr32Dst Where to return the output value, if one should be
4151 * returned.
4152 *
4153 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4154 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4155 */
4156static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4157 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4158{
4159 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4160 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4161 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4162 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4163 ? fRoundingOffMask
4164 : 0;
4165 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4166
4167 /*
4168 * Deal with potential overflows/underflows first, optimizing for none.
4169 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4170 */
4171 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4172 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4173 { /* likely? */ }
4174 /*
4175 * Underflow if the exponent zero or negative. This is attempted mapped
4176 * to a subnormal number when possible, with some additional trickery ofc.
4177 */
4178 else if (iExponentOut <= 0)
4179 {
4180 bool const fIsTiny = iExponentOut < 0
4181 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4182 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4183 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4184 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4185
4186 if (iExponentOut <= 0)
4187 {
4188 uMantissaIn = iExponentOut <= -63
4189 ? uMantissaIn != 0
4190 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4191 fRoundedOff = uMantissaIn & fRoundingOffMask;
4192 if (fRoundedOff && fIsTiny)
4193 fFsw |= X86_FSW_UE;
4194 iExponentOut = 0;
4195 }
4196 }
4197 /*
4198 * Overflow if at or above max exponent value or if we will reach max
4199 * when rounding. Will return +/-zero or +/-max value depending on
4200 * whether we're rounding or not.
4201 */
4202 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4203 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4204 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4205 {
4206 fFsw |= X86_FSW_OE;
4207 if (!(fFcw & X86_FCW_OM))
4208 return fFsw | X86_FSW_ES | X86_FSW_B;
4209 fFsw |= X86_FSW_PE;
4210 if (uRoundingAdd)
4211 fFsw |= X86_FSW_C1;
4212 if (!(fFcw & X86_FCW_PM))
4213 fFsw |= X86_FSW_ES | X86_FSW_B;
4214
4215 pr32Dst->s.fSign = fSignIn;
4216 if (uRoundingAdd)
4217 { /* Zero */
4218 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4219 pr32Dst->s.uFraction = 0;
4220 }
4221 else
4222 { /* Max */
4223 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4224 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4225 }
4226 return fFsw;
4227 }
4228
4229 /*
4230 * Normal or subnormal number.
4231 */
4232 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4233 uint64_t uMantissaOut = uMantissaIn;
4234 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4235 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4236 || fRoundedOff != uRoundingAdd)
4237 {
4238 uMantissaOut = uMantissaIn + uRoundingAdd;
4239 if (uMantissaOut >= uMantissaIn)
4240 { /* likely */ }
4241 else
4242 {
4243 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4244 iExponentOut++;
4245 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4246 fFsw |= X86_FSW_C1;
4247 }
4248 }
4249 else
4250 uMantissaOut = uMantissaIn;
4251
4252 /* Truncate the mantissa and set the return value. */
4253 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4254
4255 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4256 pr32Dst->s.uExponent = iExponentOut;
4257 pr32Dst->s.fSign = fSignIn;
4258
4259 /* Set status flags realted to rounding. */
4260 if (fRoundedOff)
4261 {
4262 fFsw |= X86_FSW_PE;
4263 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4264 fFsw |= X86_FSW_C1;
4265 if (!(fFcw & X86_FCW_PM))
4266 fFsw |= X86_FSW_ES | X86_FSW_B;
4267 }
4268
4269 return fFsw;
4270}
4271
4272
4273/**
4274 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4275 */
4276IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4277 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4278{
4279 uint16_t const fFcw = pFpuState->FCW;
4280 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4281 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4282 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4283 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4284 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4285 {
4286 pr32Dst->s.fSign = pr80Src->s.fSign;
4287 pr32Dst->s.uExponent = 0;
4288 pr32Dst->s.uFraction = 0;
4289 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4290 }
4291 else if (RTFLOAT80U_IS_INF(pr80Src))
4292 {
4293 pr32Dst->s.fSign = pr80Src->s.fSign;
4294 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4295 pr32Dst->s.uFraction = 0;
4296 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4297 }
4298 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4299 {
4300 /* Mapped to +/-QNaN */
4301 pr32Dst->s.fSign = pr80Src->s.fSign;
4302 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4303 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4304 }
4305 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4306 {
4307 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4308 if (fFcw & X86_FCW_IM)
4309 {
4310 pr32Dst->s.fSign = 1;
4311 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4312 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4313 fFsw |= X86_FSW_IE;
4314 }
4315 else
4316 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4317 }
4318 else if (RTFLOAT80U_IS_NAN(pr80Src))
4319 {
4320 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4321 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4322 {
4323 pr32Dst->s.fSign = pr80Src->s.fSign;
4324 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4325 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4326 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4327 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4328 fFsw |= X86_FSW_IE;
4329 }
4330 else
4331 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4332 }
4333 else
4334 {
4335 /* Denormal values causes both an underflow and precision exception. */
4336 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4337 if (fFcw & X86_FCW_UM)
4338 {
4339 pr32Dst->s.fSign = pr80Src->s.fSign;
4340 pr32Dst->s.uExponent = 0;
4341 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4342 {
4343 pr32Dst->s.uFraction = 1;
4344 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4345 if (!(fFcw & X86_FCW_PM))
4346 fFsw |= X86_FSW_ES | X86_FSW_B;
4347 }
4348 else
4349 {
4350 pr32Dst->s.uFraction = 0;
4351 fFsw |= X86_FSW_UE | X86_FSW_PE;
4352 if (!(fFcw & X86_FCW_PM))
4353 fFsw |= X86_FSW_ES | X86_FSW_B;
4354 }
4355 }
4356 else
4357 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4358 }
4359 *pu16FSW = fFsw;
4360}
4361
4362
/**
 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
 *
 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
 *
 * @returns Updated FPU status word value.
 * @param   fSignIn         Incoming sign indicator.
 * @param   uMantissaIn     Incoming mantissa (dot between bit 63 and 62).
 * @param   iExponentIn     Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            Prepped FPU status word, i.e. exceptions and C1 clear.
 * @param   pr64Dst         Where to return the output value, if one should be
 *                          returned.  It is not written when the function
 *                          returns early on an unmasked underflow or overflow.
 *
 * @note    Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
 * @note    Exact same logic as iemAImpl_StoreNormalR80AsR32.
 */
static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
                                             uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
{
    /* Mask covering the low mantissa bits that do not fit into the 64-bit fraction. */
    uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
    /* Value added before truncating: half of the last kept bit for round-to-nearest,
       all dropped bits when rounding away from zero (up for positive, down for
       negative input), and nothing when the rounding mode truncates. */
    uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
                                : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                ? fRoundingOffMask
                                : 0;
    uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;

    /*
     * Deal with potential overflows/underflows first, optimizing for none.
     * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
     */
    int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
    /* Unsigned compare trick: true for biased exponents 1 thru MAX-3 only. */
    if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
    { /* likely? */ }
    /*
     * Underflow if the exponent is zero or negative.  An attempt is made to
     * map this to a subnormal number when possible, with some additional
     * trickery ofc.
     */
    else if (iExponentOut <= 0)
    {
        bool const fIsTiny = iExponentOut < 0
                          || UINT64_MAX - uMantissaIn > uRoundingAdd;
        if (!(fFcw & X86_FCW_UM) && fIsTiny)
            /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
            return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;

        if (iExponentOut <= 0) /* (always true here, see the enclosing condition) */
        {
            /* Denormalize: shift the mantissa right and fold every bit shifted
               out into bit 0 (sticky bit) so it still counts as rounded off. */
            uMantissaIn = iExponentOut <= -63
                        ? uMantissaIn != 0
                        : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
            fRoundedOff = uMantissaIn & fRoundingOffMask;
            if (fRoundedOff && fIsTiny)
                fFsw |= X86_FSW_UE;
            iExponentOut = 0;
        }
    }
    /*
     * Overflow if at or above max exponent value or if we will reach max
     * when rounding.  Will return +/-infinity when a rounding addend is in
     * effect (nearest, or directed rounding away from zero) and the +/-max
     * finite value otherwise.
     */
    else if (   iExponentOut >= RTFLOAT64U_EXP_MAX
             || (   iExponentOut == RTFLOAT64U_EXP_MAX - 1
                 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
    {
        fFsw |= X86_FSW_OE;
        if (!(fFcw & X86_FCW_OM))
            return fFsw | X86_FSW_ES | X86_FSW_B;
        fFsw |= X86_FSW_PE;
        if (uRoundingAdd)
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;

        pr64Dst->s64.fSign = fSignIn;
        if (uRoundingAdd)
        {   /* Infinity (max exponent, zero fraction) */
            pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
            pr64Dst->s64.uFraction = 0;
        }
        else
        {   /* Max (largest finite value) */
            pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
            pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
        }
        return fFsw;
    }

    /*
     * Normal or subnormal number.
     */
    /* Do rounding - just truncate in near mode when midway on an even outcome. */
    uint64_t uMantissaOut = uMantissaIn;
    if (   (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
        || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
        || fRoundedOff != uRoundingAdd)
    {
        uMantissaOut = uMantissaIn + uRoundingAdd;
        if (uMantissaOut >= uMantissaIn)
        { /* likely */ }
        else
        {
            /* The addition wrapped around 64 bits, i.e. the mantissa carried
               out of the integer bit: renormalize by bumping the exponent. */
            uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
            iExponentOut++;
            Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
            fFsw |= X86_FSW_C1;
        }
    }
    else
        uMantissaOut = uMantissaIn; /* (redundant with the initializer above) */

    /* Truncate the mantissa and set the return value. */
    uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;

    pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
    pr64Dst->s64.uExponent = iExponentOut;
    pr64Dst->s64.fSign     = fSignIn;

    /* Set status flags related to rounding. */
    if (fRoundedOff)
    {
        fFsw |= X86_FSW_PE;
        /* C1 indicates that the stored magnitude was rounded up. */
        if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    return fFsw;
}
4495
4496
4497/**
4498 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4499 */
4500IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4501 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4502{
4503 uint16_t const fFcw = pFpuState->FCW;
4504 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4505 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4506 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4507 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4508 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4509 {
4510 pr64Dst->s64.fSign = pr80Src->s.fSign;
4511 pr64Dst->s64.uExponent = 0;
4512 pr64Dst->s64.uFraction = 0;
4513 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4514 }
4515 else if (RTFLOAT80U_IS_INF(pr80Src))
4516 {
4517 pr64Dst->s64.fSign = pr80Src->s.fSign;
4518 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4519 pr64Dst->s64.uFraction = 0;
4520 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4521 }
4522 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4523 {
4524 /* Mapped to +/-QNaN */
4525 pr64Dst->s64.fSign = pr80Src->s.fSign;
4526 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4527 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4528 }
4529 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4530 {
4531 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4532 if (fFcw & X86_FCW_IM)
4533 {
4534 pr64Dst->s64.fSign = 1;
4535 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4536 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4537 fFsw |= X86_FSW_IE;
4538 }
4539 else
4540 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4541 }
4542 else if (RTFLOAT80U_IS_NAN(pr80Src))
4543 {
4544 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4545 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4546 {
4547 pr64Dst->s64.fSign = pr80Src->s.fSign;
4548 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4549 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4550 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4551 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4552 fFsw |= X86_FSW_IE;
4553 }
4554 else
4555 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4556 }
4557 else
4558 {
4559 /* Denormal values causes both an underflow and precision exception. */
4560 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4561 if (fFcw & X86_FCW_UM)
4562 {
4563 pr64Dst->s64.fSign = pr80Src->s.fSign;
4564 pr64Dst->s64.uExponent = 0;
4565 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4566 {
4567 pr64Dst->s64.uFraction = 1;
4568 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4569 if (!(fFcw & X86_FCW_PM))
4570 fFsw |= X86_FSW_ES | X86_FSW_B;
4571 }
4572 else
4573 {
4574 pr64Dst->s64.uFraction = 0;
4575 fFsw |= X86_FSW_UE | X86_FSW_PE;
4576 if (!(fFcw & X86_FCW_PM))
4577 fFsw |= X86_FSW_ES | X86_FSW_B;
4578 }
4579 }
4580 else
4581 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4582 }
4583 *pu16FSW = fFsw;
4584}
4585
4586
4587IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4588 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4589{
4590 /*
4591 * FPU status word:
4592 * - TOP is irrelevant, but we must match x86 assembly version (0).
4593 * - C1 is always cleared as we don't have any stack overflows.
4594 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4595 */
4596 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4597 *pr80Dst = *pr80Src;
4598}
4599
4600
/*
 * Emits iemAImpl_fist_r80_to_i<a_cBits>: converts an 80-bit floating point
 * value to a signed integer using the rounding mode in FCW.
 *
 * Macro parameters:
 *      a_cBits             - Width of the target integer in bits (part of the
 *                            generated function name too).
 *      a_iType             - The target integer type.
 *      a_iTypeMin          - The minimum value of the target integer type.
 *      a_iTypeIndefinite   - The value stored when the invalid operand
 *                            exception (IE) is raised while masked.
 *
 * The generated function preserves C0, C2 and C3 from the incoming FSW, sets
 * C1 when the result was rounded away from zero, and ORs 7 into the TOP field
 * together with ES and B when raising an unmasked IE.  The destination is not
 * written in the unmasked IE case.
 *
 * Mantissa:
 *      63         56        48        40        32        24        16        8         0
 *      v          v         v         v         v         v         v         v         v
 *      1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
 *          \    \    \    \    \    \    \    \    \    \    \    \    \    \    \    \
 * Exp:      0    4    8    12   16   20   24   28   32   36   40   44   48   52   56   60
 *
 * int64_t has the same width, only bit 63 is the sign bit.  So, the max we can map over
 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62.  The number of bits we
 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
 * where we'll drop off all but bit 63.
 */
#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                           a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t uMantissa = pr80Val->s.uMantissa; \
        int32_t  iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        /* Exponents 0 thru a_cBits - 2: the magnitude fits unless rounding carries into the sign bit. */ \
        if ((uint32_t)iExponent <= a_cBits - 2) \
        { \
            /* Number of fractional mantissa bits to drop, and the rounding addend for them. */ \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
                                            ? RT_BIT_64(cShiftOff - 1) \
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
                                            ? fRoundingOffMask \
                                            : 0; \
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask; \
            \
            uMantissa >>= cShiftOff; \
            /* uRounding is 1 if the dropped bits plus the addend carry into the integer part, else 0. */ \
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
            uMantissa += uRounding; \
            if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
            { \
                if (fRoundedOff) \
                { \
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
                    else if (uRounding) \
                        fFsw |= X86_FSW_C1; \
                    fFsw |= X86_FSW_PE; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                \
                if (!fSignIn) \
                    *piDst = (a_iType)uMantissa; \
                else \
                    *piDst = -(a_iType)uMantissa; \
            } \
            else \
            { \
                /* overflowed after rounding. */ \
                AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
                          ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
                           pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
                \
                /* Special case for the integer minimum value. */ \
                if (fSignIn) \
                { \
                    *piDst = a_iTypeMin; \
                    fFsw |= X86_FSW_PE | X86_FSW_C1; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                else \
                { \
                    /* Positive overflow: invalid operand.  Note that this path stores \
                       a_iTypeMin rather than a_iTypeIndefinite when IE is masked. */ \
                    fFsw |= X86_FSW_IE; \
                    if (fFcw & X86_FCW_IM) \
                        *piDst = a_iTypeMin; \
                    else \
                        fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
                } \
            } \
        } \
        /* \
         * Tiny sub-zero numbers. \
         */ \
        else if (iExponent < 0) \
        { \
            /* Magnitude below one: the result is 0 or +/-1 depending on sign and rounding \
               mode; nearest only rounds away from zero for exponent -1 (0.5 and above). */ \
            if (!fSignIn) \
            { \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                { \
                    *piDst = 1; \
                    fFsw |= X86_FSW_C1; \
                } \
                else \
                    *piDst = 0; \
            } \
            else \
            { \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                    *piDst = 0; \
                else \
                { \
                    *piDst = -1; \
                    fFsw |= X86_FSW_C1; \
                } \
            } \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (   a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63))) \
        { \
            *piDst = a_iTypeMin; \
            /* Inexact if any of the mantissa bits below the integer part are set. */ \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are either rounded \
     * to zero, 1 or -1 depending on sign and rounding control. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
            *piDst = 0; \
        else \
        { \
            *piDst = fSignIn ? -1 : 1; \
            fFsw |= X86_FSW_C1; \
        } \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4787
4788#endif /*IEM_WITHOUT_ASSEMBLY */
4789
4790
/*
 * The FISTT instruction was added with SSE3 and is a lot simpler than FIST:
 * the value is always truncated towards zero, regardless of the rounding
 * control in FCW.
 *
 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
 * thus the @a a_cBitsIn.
 *
 * Macro parameters:
 *      a_cBits             - Width of the target integer in bits.
 *      a_cBitsIn           - Width used for the "does the exponent fit" check.
 *                            Every instantiation below passes the same value
 *                            as a_cBits.
 *      a_iType             - The target integer type.
 *      a_iTypeMin          - The minimum value of the target integer type.
 *      a_iTypeMax          - Not referenced by the macro body.
 *      a_iTypeIndefinite   - The value stored when IE is raised while masked.
 *      a_Suffix            - Function name suffix (RT_NOTHING, _intel, _amd).
 *      a_fIntelVersion     - Not referenced by the macro body.
 *
 * The generated function is named iemAImpl_fistt_r80_to_i<a_cBits><a_Suffix>.
 * Since a_fIntelVersion is unused, the _intel and _amd variants emitted below
 * have identical bodies.
 */
#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                                              a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t uMantissa = pr80Val->s.uMantissa; \
        int32_t  iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        if ((uint32_t)iExponent <= a_cBitsIn - 2) \
        { \
            /* Drop the fractional mantissa bits (truncation) and apply the sign. */ \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            uint64_t const fRoundedOff      = uMantissa & fRoundingOffMask; \
            uMantissa >>= cShiftOff; \
            /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
            if (!fSignIn) \
                *piDst = (a_iType)uMantissa; \
            else \
                *piDst = -(a_iType)uMantissa; \
            \
            /* Any dropped bit makes the result inexact. */ \
            if (fRoundedOff) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Tiny sub-zero numbers. \
         */ \
        else if (iExponent < 0) \
        { \
            *piDst = 0; \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (a_cBits < 64 \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63)) ) \
        { \
            *piDst = a_iTypeMin; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Figure this weirdness. \
         * (The leading 0 in the condition disables this branch entirely.) \
         */ \
        else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
        { \
            *piDst = 0; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are truncated to zero. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        *piDst = 0; \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
/* The unsuffixed variants are only emitted for the portable C build, while the
   16-bit _intel and _amd variants are always emitted. */
#if defined(IEM_WITHOUT_ASSEMBLY)
EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
#endif
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd,   0)
4920
4921
4922#if defined(IEM_WITHOUT_ASSEMBLY)
4923
/**
 * Converts an 80-bit floating point value to an 80-bit packed BCD integer,
 * rounding according to the rounding control in FCW.
 *
 * @param   pFpuState   The FPU state supplying FCW and the C0/C2/C3 bits of FSW.
 * @param   pu16FSW     Where to return the resulting status word.
 * @param   pd80Dst     Where to store the packed BCD result.  Not written when
 *                      an unmasked invalid operand exception is raised.
 * @param   pr80Src     The value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
{
    /* Constant results, each table being indexed by the sign of the input. */
    /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
    static RTPBCD80U const s_ad80Zeros[2]  = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
    static RTPBCD80U const s_ad80One[2]    = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
                                               RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
    static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();

    uint16_t const fFcw    = pFpuState->FCW;
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    bool const     fSignIn = pr80Src->s.fSign;

    /*
     * Deal with normal numbers first.
     */
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
    {
        uint64_t uMantissa = pr80Src->s.uMantissa;
        int32_t  iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
        /* At exponent 59 the mantissa is shifted right by 4 below, and
           0xde0b6b3a763ffff is 999999999999999999, i.e. eighteen decimal nines. */
        if (   (uint32_t)iExponent <= 58
            || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
        {
            /* Number of fractional mantissa bits to drop, and the rounding addend for them. */
            unsigned const cShiftOff        = 63 - iExponent;
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                            ? RT_BIT_64(cShiftOff - 1)
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                            ? fRoundingOffMask
                                            : 0;
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask;

            uMantissa >>= cShiftOff;
            /* uRounding is 1 if the dropped bits plus the addend carry into the integer part, else 0. */
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
            uMantissa += uRounding;
            if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
            {
                if (fRoundedOff)
                {
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
                    else if (uRounding)
                        fFsw |= X86_FSW_C1;
                    fFsw |= X86_FSW_PE;
                    if (!(fFcw & X86_FCW_PM))
                        fFsw |= X86_FSW_ES | X86_FSW_B;
                }

                /* Emit the decimal digits two at a time, least significant pair first. */
                pd80Dst->s.fSign = fSignIn;
                pd80Dst->s.uPad  = 0;
                for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
                {
                    unsigned const uDigits = uMantissa % 100;
                    uMantissa /= 100;
                    uint8_t const bLo = uDigits % 10;
                    uint8_t const bHi = uDigits / 10;
                    pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
                }
            }
            else
            {
                /* overflowed after rounding. */
                fFsw |= X86_FSW_IE;
                if (fFcw & X86_FCW_IM)
                    *pd80Dst = s_d80Indefinite;
                else
                    fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
            }
        }
        /*
         * Tiny sub-zero numbers.
         */
        else if (iExponent < 0)
        {
            /* Magnitude below one: the result is a signed zero or one depending on sign and
               rounding mode; nearest only rounds away from zero for exponent -1 (0.5 and above). */
            if (!fSignIn)
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
                else
                    *pd80Dst = s_ad80Zeros[fSignIn];
            }
            else
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                    *pd80Dst = s_ad80Zeros[fSignIn];
                else
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
            }
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        /*
         * Too large/small number outside the target integer range.
         */
        else
        {
            fFsw |= X86_FSW_IE;
            if (fFcw & X86_FCW_IM)
                *pd80Dst = s_d80Indefinite;
            else
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    /*
     * Map +0 and -0 to a BCD zero.  Unlike the integer stores, the zero is
     * picked from the table using the input sign (presumably yielding a zero
     * of matching sign - verify against RTPBCD80U_INIT_ZERO).
     */
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
        *pd80Dst = s_ad80Zeros[fSignIn];
    /*
     * Denormals are just really tiny sub-zero numbers that are either rounded
     * to zero, 1 or -1 depending on sign and rounding control.
     */
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
    {
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
            *pd80Dst = s_ad80Zeros[fSignIn];
        else
        {
            *pd80Dst = s_ad80One[fSignIn];
            fFsw |= X86_FSW_C1;
        }
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }
    /*
     * All other special values are considered invalid arguments and result
     * in an IE exception and indefinite value if masked.
     */
    else
    {
        fFsw |= X86_FSW_IE;
        if (fFcw & X86_FCW_IM)
            *pd80Dst = s_d80Indefinite;
        else
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
    }
    *pu16FSW = fFsw;
}
5073
5074
5075/*********************************************************************************************************************************
5076* FPU Helpers *
5077*********************************************************************************************************************************/
/* Compile-time size checks of the IPRT floating point unions: the helpers
   below copy and reinterpret these types, so their sizes must match the
   128-bit, 80-bit, 64-bit and 32-bit formats exactly. */
AssertCompileSize(RTFLOAT128U, 16);
AssertCompileSize(RTFLOAT80U, 10);
AssertCompileSize(RTFLOAT64U, 8);
AssertCompileSize(RTFLOAT32U, 4);
5082
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro will declare a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
 * a normalization was performed.
 *
 * @param   a_pr80Val           Pointer variable to the value to check.  It is
 *                              redirected to the local copy when the value
 *                              had to be normalized; the original value is
 *                              never modified.
 * @param   a_r80ValNormalized  Name of the local RTFLOAT80U variable to
 *                              declare for holding the normalized copy.
 *
 * @note    This must be applied before calling SoftFloat with a value that could
 *          be a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
 *          correctly.
 * @note    The expansion starts with a declaration, so the macro must be used
 *          where a declaration is permitted.  The trailing 'else do {} while (0)'
 *          makes the invocation require a terminating semicolon.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { \
        a_r80ValNormalized = *a_pr80Val; \
        a_r80ValNormalized.s.uExponent = 1; \
        a_pr80Val = &a_r80ValNormalized; \
    } else do {} while (0)
5106
5107#ifdef IEM_WITH_FLOAT128_FOR_FPU
5108
5109DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
5110{
5111 int fNew;
5112 switch (fFcw & X86_FCW_RC_MASK)
5113 {
5114 default:
5115 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
5116 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
5117 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
5118 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
5119 }
5120 int fOld = fegetround();
5121 fesetround(fNew);
5122 return fOld;
5123}
5124
5125
5126DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
5127{
5128 fesetround(fOld);
5129}
5130
5131DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
5132{
5133 RT_NOREF(fFcw);
5134 RTFLOAT128U Tmp;
5135 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
5136 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
5137 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
5138 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
5139 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
5140 {
5141 Assert(Tmp.s.uExponent == 0);
5142 Tmp.s2.uSignAndExponent++;
5143 }
5144 return *(_Float128 *)&Tmp;
5145}
5146
5147
5148DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5149{
5150 RT_NOREF(fFcw);
5151 RTFLOAT128U Tmp;
5152 *(_Float128 *)&Tmp = rd128ValSrc;
5153 ASMCompilerBarrier();
5154 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5155 {
5156 pr80Dst->s.fSign = Tmp.s64.fSign;
5157 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5158 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5159 | Tmp.s64.uFractionLo >> (64 - 15);
5160
5161 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5162 unsigned const cShiftOff = 64 - 15;
5163 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5164 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5165 if (uRoundedOff)
5166 {
5167 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5168 ? RT_BIT_64(cShiftOff - 1)
5169 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5170 ? fRoundingOffMask
5171 : 0;
5172 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5173 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5174 || uRoundedOff != uRoundingAdd)
5175 {
5176 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5177 {
5178 uFraction += 1;
5179 if (!(uFraction & RT_BIT_64(63)))
5180 { /* likely */ }
5181 else
5182 {
5183 uFraction >>= 1;
5184 pr80Dst->s.uExponent++;
5185 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5186 return fFsw;
5187 }
5188 fFsw |= X86_FSW_C1;
5189 }
5190 }
5191 fFsw |= X86_FSW_PE;
5192 if (!(fFcw & X86_FCW_PM))
5193 fFsw |= X86_FSW_ES | X86_FSW_B;
5194 }
5195 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5196 }
5197 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5198 {
5199 pr80Dst->s.fSign = Tmp.s64.fSign;
5200 pr80Dst->s.uExponent = 0;
5201 pr80Dst->s.uMantissa = 0;
5202 }
5203 else if (RTFLOAT128U_IS_INF(&Tmp))
5204 {
5205 pr80Dst->s.fSign = Tmp.s64.fSign;
5206 pr80Dst->s.uExponent = 0;
5207 pr80Dst->s.uMantissa = 0;
5208 }
5209 return fFsw;
5210}
5211
5212
5213#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5214
/**
 * Initializer for the SoftFloat state structure.
 *
 * Produces a brace initializer with five members derived from the FPU control
 * word @a a_fFcw, in this order:
 *      -# Tininess detection: always after rounding.
 *      -# Rounding mode: the FCW rounding control mapped to the SoftFloat
 *         equivalent (nearest-even, max for up, min for down and minMag for
 *         the remaining value, i.e. round towards zero).
 *      -# Zero (presumably the accumulated exception flags - confirm against
 *         the softfloat_state_t declaration).
 *      -# The exception mask bits of the FCW.
 *      -# A precision value derived from the FCW precision control: 64 for
 *         X86_FCW_PC_53, 32 for X86_FCW_PC_24 and 80 otherwise (presumably
 *         the width of the corresponding floating point format).
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
          ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN    ? (uint8_t)softfloat_round_min \
        :                                                      (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
          ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
    }
5228
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The result is @a a_fFsw with the following ORed in:
 *      - The softfloat_flag_c1 bit of the SoftFloat exception flags shifted
 *        left by two (presumably landing on X86_FSW_C1 - confirm against the
 *        softfloat_flag_c1 definition).
 *      - The SoftFloat exception flags covered by X86_FSW_XCPT_MASK; they are
 *        used as-is, so they are assumed to share the FSW bit positions.
 *      - X86_FSW_ES and X86_FSW_B if any of those exceptions is not masked in
 *        @a a_fFcw.
 *
 * @param   a_fFsw          The FPU status word to update.
 * @param   a_pSoftState    Pointer to the SoftFloat state holding the flags.
 * @param   a_fFcw          The FPU control word supplying the exception masks.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (   (a_fFsw) \
      | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
      | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
      | (   ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
          ? X86_FSW_ES | X86_FSW_B : 0) )
5236
5237
5238DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5239{
5240 RT_NOREF(fFcw);
5241 Assert(cBits > 64);
5242# if 0 /* rounding does not seem to help */
5243 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5244 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5245 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5246 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5247 {
5248 uint64_t uOld = r128.v[0];
5249 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5250 if (r128.v[0] < uOld)
5251 r128.v[1] += 1;
5252 }
5253# else
5254 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5255# endif
5256 return r128;
5257}
5258
5259
5260DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5261{
5262 RT_NOREF(fFcw);
5263 Assert(cBits > 64);
5264# if 0 /* rounding does not seem to help, not even on constants */
5265 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5266 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5267 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5268 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5269 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5270 {
5271 uint64_t uOld = r128.v[0];
5272 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5273 if (r128.v[0] < uOld)
5274 r128.v[1] += 1;
5275 }
5276 return r128;
5277# else
5278 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5279 return r128;
5280# endif
5281}
5282
5283
# if 0 /* unused */
/**
 * Converts from the IPRT 128-bit floating point union to the SoftFloat
 * float128_t type by copying the two 64-bit words unmodified.
 *
 * @note    Currently compiled out as nothing uses it.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
    return r128;
}
# endif
5291
5292
5293/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5294DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5295{
5296 extFloat80_t Tmp;
5297 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5298 Tmp.signif = pr80Val->s2.uMantissa;
5299 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5300 return extF80_to_f128(Tmp, &Ignored);
5301}
5302
5303
5304/**
5305 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5306 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5307 *
5308 * This is only a structure format conversion, nothing else.
5309 */
5310DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5311{
5312 extFloat80_t Tmp;
5313 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5314 Tmp.signif = pr80Val->s2.uMantissa;
5315 return Tmp;
5316}
5317
5318
5319/**
5320 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5321 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5322 *
5323 * This is only a structure format conversion, nothing else.
5324 */
5325DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5326{
5327 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5328 pr80Dst->s2.uMantissa = r80XSrc.signif;
5329 return pr80Dst;
5330}
5331
5332
5333DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5334{
5335 RT_NOREF(fFcw);
5336 RTFLOAT128U Tmp;
5337 *(float128_t *)&Tmp = r128Src;
5338 ASMCompilerBarrier();
5339
5340 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5341 {
5342 pr80Dst->s.fSign = Tmp.s64.fSign;
5343 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5344 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5345 | Tmp.s64.uFractionLo >> (64 - 15);
5346
5347 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5348 unsigned const cShiftOff = 64 - 15;
5349 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5350 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5351 if (uRoundedOff)
5352 {
5353 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5354 ? RT_BIT_64(cShiftOff - 1)
5355 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5356 ? fRoundingOffMask
5357 : 0;
5358 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5359 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5360 || uRoundedOff != uRoundingAdd)
5361 {
5362 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5363 {
5364 uFraction += 1;
5365 if (!(uFraction & RT_BIT_64(63)))
5366 { /* likely */ }
5367 else
5368 {
5369 uFraction >>= 1;
5370 pr80Dst->s.uExponent++;
5371 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5372 return fFsw;
5373 }
5374 fFsw |= X86_FSW_C1;
5375 }
5376 }
5377 fFsw |= X86_FSW_PE;
5378 if (!(fFcw & X86_FCW_PM))
5379 fFsw |= X86_FSW_ES | X86_FSW_B;
5380 }
5381
5382 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5383 }
5384 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5385 {
5386 pr80Dst->s.fSign = Tmp.s64.fSign;
5387 pr80Dst->s.uExponent = 0;
5388 pr80Dst->s.uMantissa = 0;
5389 }
5390 else if (RTFLOAT128U_IS_INF(&Tmp))
5391 {
5392 pr80Dst->s.fSign = Tmp.s64.fSign;
5393 pr80Dst->s.uExponent = 0x7fff;
5394 pr80Dst->s.uMantissa = 0;
5395 }
5396 return fFsw;
5397}
5398
5399
5400/**
5401 * Helper for transfering exception and C1 to FSW and setting the result value
5402 * accordingly.
5403 *
5404 * @returns Updated FSW.
5405 * @param pSoftState The SoftFloat state following the operation.
5406 * @param r80XResult The result of the SoftFloat operation.
5407 * @param pr80Result Where to store the result for IEM.
5408 * @param fFcw The FPU control word.
5409 * @param fFsw The FSW before the operation, with necessary bits
5410 * cleared and such.
5411 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5412 * raised.
5413 */
5414DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5415 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5416 PCRTFLOAT80U pr80XcptResult)
5417{
5418 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5419 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5420 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5421 fFsw |= X86_FSW_ES | X86_FSW_B;
5422
5423 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5424 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5425 else
5426 {
5427 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5428 *pr80Result = *pr80XcptResult;
5429 }
5430 return fFsw;
5431}
5432
5433
5434/**
5435 * Helper doing polynomial evaluation using Horner's method.
5436 *
5437 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5438 */
5439float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5440 unsigned cPrecision, softfloat_state_t *pSoftState)
5441{
5442 Assert(cHornerConsts > 1);
5443 size_t i = cHornerConsts - 1;
5444 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5445 while (i-- > 0)
5446 {
5447 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5448 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5449 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5450 }
5451 return r128Result;
5452}
5453
5454#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5455
5456
5457/**
5458 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5459 * mantissa, exponent and sign.
5460 *
5461 * @returns Updated FSW.
5462 * @param pr80Dst Where to return the composed value.
5463 * @param fSign The sign.
5464 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5465 * ignored and should be zero. This will probably be
5466 * modified during normalization and rounding.
5467 * @param iExponent Unbiased exponent.
5468 * @param fFcw The FPU control word.
5469 * @param fFsw The FPU status word.
5470 */
5471static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5472 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5473{
5474 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5475
5476 iExponent += RTFLOAT80U_EXP_BIAS;
5477
5478 /* Do normalization if necessary and possible. */
5479 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5480 {
5481 int cShift = 192 - RTUInt256BitCount(puMantissa);
5482 if (iExponent > cShift)
5483 iExponent -= cShift;
5484 else
5485 {
5486 if (fFcw & X86_FCW_UM)
5487 {
5488 if (iExponent > 0)
5489 cShift = --iExponent;
5490 else
5491 cShift = 0;
5492 }
5493 iExponent -= cShift;
5494 }
5495 RTUInt256AssignShiftLeft(puMantissa, cShift);
5496 }
5497
5498 /* Do rounding. */
5499 uint64_t uMantissa = puMantissa->QWords.qw2;
5500 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5501 {
5502 bool fAdd;
5503 switch (fFcw & X86_FCW_RC_MASK)
5504 {
5505 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5506 case X86_FCW_RC_NEAREST:
5507 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5508 {
5509 if ( (uMantissa & 1)
5510 || puMantissa->QWords.qw0 != 0
5511 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5512 {
5513 fAdd = true;
5514 break;
5515 }
5516 uMantissa &= ~(uint64_t)1;
5517 }
5518 fAdd = false;
5519 break;
5520 case X86_FCW_RC_ZERO:
5521 fAdd = false;
5522 break;
5523 case X86_FCW_RC_UP:
5524 fAdd = !fSign;
5525 break;
5526 case X86_FCW_RC_DOWN:
5527 fAdd = fSign;
5528 break;
5529 }
5530 if (fAdd)
5531 {
5532 uint64_t const uTmp = uMantissa;
5533 uMantissa = uTmp + 1;
5534 if (uMantissa < uTmp)
5535 {
5536 uMantissa >>= 1;
5537 uMantissa |= RT_BIT_64(63);
5538 iExponent++;
5539 }
5540 fFsw |= X86_FSW_C1;
5541 }
5542 fFsw |= X86_FSW_PE;
5543 if (!(fFcw & X86_FCW_PM))
5544 fFsw |= X86_FSW_ES | X86_FSW_B;
5545 }
5546
5547 /* Check for underflow (denormals). */
5548 if (iExponent <= 0)
5549 {
5550 if (fFcw & X86_FCW_UM)
5551 {
5552 if (uMantissa & RT_BIT_64(63))
5553 uMantissa >>= 1;
5554 iExponent = 0;
5555 }
5556 else
5557 {
5558 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5559 fFsw |= X86_FSW_ES | X86_FSW_B;
5560 }
5561 fFsw |= X86_FSW_UE;
5562 }
5563 /* Check for overflow */
5564 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5565 {
5566 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5567 }
5568
5569 /* Compose the result. */
5570 pr80Dst->s.uMantissa = uMantissa;
5571 pr80Dst->s.uExponent = iExponent;
5572 pr80Dst->s.fSign = fSign;
5573 return fFsw;
5574}
5575
5576
5577/**
5578 * See also iemAImpl_fld_r80_from_r32
5579 */
5580static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5581{
5582 uint16_t fFsw = 0;
5583 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5584 {
5585 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5586 pr80Dst->sj64.fInteger = 1;
5587 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5588 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5589 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5590 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5591 }
5592 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5593 {
5594 pr80Dst->s.fSign = pr32Val->s.fSign;
5595 pr80Dst->s.uExponent = 0;
5596 pr80Dst->s.uMantissa = 0;
5597 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5598 }
5599 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5600 {
5601 /* Subnormal -> normalized + X86_FSW_DE return. */
5602 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5603 pr80Dst->sj64.fInteger = 1;
5604 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5605 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5606 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5607 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5608 fFsw = X86_FSW_DE;
5609 }
5610 else if (RTFLOAT32U_IS_INF(pr32Val))
5611 {
5612 pr80Dst->s.fSign = pr32Val->s.fSign;
5613 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5614 pr80Dst->s.uMantissa = RT_BIT_64(63);
5615 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5616 }
5617 else
5618 {
5619 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5620 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5621 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5622 pr80Dst->sj64.fInteger = 1;
5623 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5624 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5625 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5626 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5627 }
5628 return fFsw;
5629}
5630
5631
5632/**
5633 * See also iemAImpl_fld_r80_from_r64
5634 */
5635static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5636{
5637 uint16_t fFsw = 0;
5638 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5639 {
5640 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5641 pr80Dst->sj64.fInteger = 1;
5642 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5643 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5644 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5645 }
5646 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5647 {
5648 pr80Dst->s.fSign = pr64Val->s.fSign;
5649 pr80Dst->s.uExponent = 0;
5650 pr80Dst->s.uMantissa = 0;
5651 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5652 }
5653 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5654 {
5655 /* Subnormal values gets normalized. */
5656 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5657 pr80Dst->sj64.fInteger = 1;
5658 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5659 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5660 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5661 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5662 fFsw = X86_FSW_DE;
5663 }
5664 else if (RTFLOAT64U_IS_INF(pr64Val))
5665 {
5666 pr80Dst->s.fSign = pr64Val->s.fSign;
5667 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5668 pr80Dst->s.uMantissa = RT_BIT_64(63);
5669 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5670 }
5671 else
5672 {
5673 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5674 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5675 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5676 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5677 pr80Dst->sj64.fInteger = 1;
5678 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5679 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5680 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5681 }
5682 return fFsw;
5683}
5684
5685
5686/**
5687 * See also EMIT_FILD.
5688 */
5689#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5690static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5691{ \
5692 if (iVal == 0) \
5693 { \
5694 pr80Dst->s.fSign = 0; \
5695 pr80Dst->s.uExponent = 0; \
5696 pr80Dst->s.uMantissa = 0; \
5697 } \
5698 else \
5699 { \
5700 if (iVal > 0) \
5701 pr80Dst->s.fSign = 0; \
5702 else \
5703 { \
5704 pr80Dst->s.fSign = 1; \
5705 iVal = -iVal; \
5706 } \
5707 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5708 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5709 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5710 } \
5711 return pr80Dst; \
5712}
5713EMIT_CONVERT_IXX_TO_R80(16)
5714EMIT_CONVERT_IXX_TO_R80(32)
5715//EMIT_CONVERT_IXX_TO_R80(64)
5716
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * The 64-bit operand is widened to 80 bits and the operation is carried out
 * by @a a_fnR80ByR80, after which TOP is forced to 7 in the resulting FSW.
 *
 * A subnormal 64-bit operand makes the conversion report X86_FSW_DE.  That is
 * dropped when the first operand is 387-invalid or a NaN, or when
 * @a a_DenormalException evaluates to true.  Otherwise, if denormal
 * exceptions are unmasked, the first operand is returned as the result with
 * DE, ES and B set and @a a_fnR80ByR80 is not called; if they are masked, DE
 * is merged into the final FSW.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        if (   !RTFLOAT80U_IS_387_INVALID(pr80Val1) \
            && !RTFLOAT80U_IS_NAN(pr80Val1) \
            && !(a_DenormalException)) \
        { \
            if (!(pFpuState->FCW & X86_FCW_DM)) \
            { \
                pFpuRes->r80Result = *pr80Val1; \
                pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                             | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
                return; \
            } \
        } \
        else \
            fFsw = 0; \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5739
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * The 32-bit operand is widened to 80 bits and the operation is carried out
 * by @a a_fnR80ByR80, after which TOP is forced to 7 in the resulting FSW.
 *
 * A subnormal 32-bit operand makes the conversion report X86_FSW_DE.  That is
 * dropped when the first operand is 387-invalid or a NaN, or when
 * @a a_DenormalException evaluates to true.  Otherwise, if denormal
 * exceptions are unmasked, the first operand is returned as the result with
 * DE, ES and B set and @a a_fnR80ByR80 is not called; if they are masked, DE
 * is merged into the final FSW.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        if (   !RTFLOAT80U_IS_387_INVALID(pr80Val1) \
            && !RTFLOAT80U_IS_NAN(pr80Val1) \
            && !(a_DenormalException)) \
        { \
            if (!(pFpuState->FCW & X86_FCW_DM)) \
            { \
                pFpuRes->r80Result = *pr80Val1; \
                pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                             | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
                return; \
            } \
        } \
        else \
            fFsw = 0; \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5762
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * The 32-bit integer operand is converted to 80-bit floating point and the
 * operation is carried out by @a a_fnR80ByR80, after which TOP is forced to 7
 * in the resulting FSW.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5771
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * The 16-bit integer operand is converted to 80-bit floating point and the
 * operation is carried out by @a a_fnR80ByR80, after which TOP is forced to 7
 * in the resulting FSW.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5780
5781
5782
5783/*********************************************************************************************************************************
*   x87 FPU Division Operations                                                                                                  *
5785*********************************************************************************************************************************/
5786
5787/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5788static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5789 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5790{
5791 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5792 {
5793 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5794 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5795 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5796 }
5797 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5798 { /* Div by zero. */
5799 if (fFcw & X86_FCW_ZM)
5800 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5801 else
5802 {
5803 *pr80Result = *pr80Val1Org;
5804 fFsw |= X86_FSW_ES | X86_FSW_B;
5805 }
5806 fFsw |= X86_FSW_ZE;
5807 }
5808 else
5809 { /* Invalid operand */
5810 if (fFcw & X86_FCW_IM)
5811 *pr80Result = g_r80Indefinite;
5812 else
5813 {
5814 *pr80Result = *pr80Val1Org;
5815 fFsw |= X86_FSW_ES | X86_FSW_B;
5816 }
5817 fFsw |= X86_FSW_IE;
5818 }
5819 return fFsw;
5820}
5821
5822
5823IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5824 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5825{
5826 uint16_t const fFcw = pFpuState->FCW;
5827 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5828
5829 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5830 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5831 {
5832 if (fFcw & X86_FCW_IM)
5833 pFpuRes->r80Result = g_r80Indefinite;
5834 else
5835 {
5836 pFpuRes->r80Result = *pr80Val1;
5837 fFsw |= X86_FSW_ES | X86_FSW_B;
5838 }
5839 fFsw |= X86_FSW_IE;
5840 }
5841 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5842 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5843 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5844 {
5845 if (fFcw & X86_FCW_DM)
5846 {
5847 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5848 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5849 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5850 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5851 }
5852 else
5853 {
5854 pFpuRes->r80Result = *pr80Val1;
5855 fFsw |= X86_FSW_ES | X86_FSW_B;
5856 }
5857 fFsw |= X86_FSW_DE;
5858 }
5859 /* SoftFloat can handle the rest: */
5860 else
5861 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5862
5863 pFpuRes->FSW = fFsw;
5864}
5865
5866
5867EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5868EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5869EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5870EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5871
5872
5873IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5874 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5875{
5876 uint16_t const fFcw = pFpuState->FCW;
5877 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5878
5879 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5880 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5881 {
5882 if (fFcw & X86_FCW_IM)
5883 pFpuRes->r80Result = g_r80Indefinite;
5884 else
5885 {
5886 pFpuRes->r80Result = *pr80Val1;
5887 fFsw |= X86_FSW_ES | X86_FSW_B;
5888 }
5889 fFsw |= X86_FSW_IE;
5890 }
5891 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5892 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5893 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5894 {
5895 if (fFcw & X86_FCW_DM)
5896 {
5897 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5898 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5899 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5900 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5901 }
5902 else
5903 {
5904 pFpuRes->r80Result = *pr80Val1;
5905 fFsw |= X86_FSW_ES | X86_FSW_B;
5906 }
5907 fFsw |= X86_FSW_DE;
5908 }
5909 /* SoftFloat can handle the rest: */
5910 else
5911 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5912
5913 pFpuRes->FSW = fFsw;
5914}
5915
5916
5917EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5918EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5919EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5920EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5921
5922
5923/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5924static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5925 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5926{
5927 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5928 {
5929 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5930 uint16_t fCxFlags = 0;
5931 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5932 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5933 &fCxFlags, &SoftState);
5934 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5935 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5936 if ( !(fFsw & X86_FSW_IE)
5937 && !RTFLOAT80U_IS_NAN(pr80Result)
5938 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5939 {
5940 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5941 fFsw |= fCxFlags & X86_FSW_C_MASK;
5942 }
5943 return fFsw;
5944 }
5945
5946 /* Invalid operand */
5947 if (fFcw & X86_FCW_IM)
5948 *pr80Result = g_r80Indefinite;
5949 else
5950 {
5951 *pr80Result = *pr80Val1Org;
5952 fFsw |= X86_FSW_ES | X86_FSW_B;
5953 }
5954 return fFsw | X86_FSW_IE;
5955}
5956
5957
5958static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5959 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5960{
5961 uint16_t const fFcw = pFpuState->FCW;
5962 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5963
5964 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5965 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5966 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5967 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5968 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5969 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5970 {
5971 if (fFcw & X86_FCW_IM)
5972 pFpuRes->r80Result = g_r80Indefinite;
5973 else
5974 {
5975 pFpuRes->r80Result = *pr80Val1;
5976 fFsw |= X86_FSW_ES | X86_FSW_B;
5977 }
5978 fFsw |= X86_FSW_IE;
5979 }
5980 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5981 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5982 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5983 {
5984 if (fFcw & X86_FCW_DM)
5985 {
5986 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5987 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5988 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5989 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5990 pr80Val1Org, fLegacyInstr);
5991 }
5992 else
5993 {
5994 pFpuRes->r80Result = *pr80Val1;
5995 fFsw |= X86_FSW_ES | X86_FSW_B;
5996 }
5997 fFsw |= X86_FSW_DE;
5998 }
5999 /* SoftFloat can handle the rest: */
6000 else
6001 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
6002 pr80Val1, fLegacyInstr);
6003
6004 pFpuRes->FSW = fFsw;
6005}
6006
6007
6008IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6009 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6010{
6011 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
6012}
6013
6014
6015IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6016 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6017{
6018 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
6019}
6020
6021
6022/*********************************************************************************************************************************
6023* x87 FPU Multiplication Operations *
6024*********************************************************************************************************************************/
6025
6026/** Worker for iemAImpl_fmul_r80_by_r80. */
6027static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6028 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6029{
6030 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6031 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6032 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6033}
6034
6035
6036IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6037 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6038{
6039 uint16_t const fFcw = pFpuState->FCW;
6040 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6041
6042 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6043 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6044 {
6045 if (fFcw & X86_FCW_IM)
6046 pFpuRes->r80Result = g_r80Indefinite;
6047 else
6048 {
6049 pFpuRes->r80Result = *pr80Val1;
6050 fFsw |= X86_FSW_ES | X86_FSW_B;
6051 }
6052 fFsw |= X86_FSW_IE;
6053 }
6054 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6055 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6056 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6057 {
6058 if (fFcw & X86_FCW_DM)
6059 {
6060 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6061 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6062 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6063 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6064 }
6065 else
6066 {
6067 pFpuRes->r80Result = *pr80Val1;
6068 fFsw |= X86_FSW_ES | X86_FSW_B;
6069 }
6070 fFsw |= X86_FSW_DE;
6071 }
6072 /* SoftFloat can handle the rest: */
6073 else
6074 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6075
6076 pFpuRes->FSW = fFsw;
6077}
6078
6079
6080EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
6081EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
6082EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
6083EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
6084
6085
6086/*********************************************************************************************************************************
6087* x87 FPU Addition *
6088*********************************************************************************************************************************/
6089
6090/** Worker for iemAImpl_fadd_r80_by_r80. */
6091static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6092 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6093{
6094 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6095 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6096 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6097}
6098
6099
6100IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6101 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6102{
6103 uint16_t const fFcw = pFpuState->FCW;
6104 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6105
6106 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6107 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6108 {
6109 if (fFcw & X86_FCW_IM)
6110 pFpuRes->r80Result = g_r80Indefinite;
6111 else
6112 {
6113 pFpuRes->r80Result = *pr80Val1;
6114 fFsw |= X86_FSW_ES | X86_FSW_B;
6115 }
6116 fFsw |= X86_FSW_IE;
6117 }
6118 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6119 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6120 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6121 {
6122 if (fFcw & X86_FCW_DM)
6123 {
6124 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6125 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6126 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6127 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6128 }
6129 else
6130 {
6131 pFpuRes->r80Result = *pr80Val1;
6132 fFsw |= X86_FSW_ES | X86_FSW_B;
6133 }
6134 fFsw |= X86_FSW_DE;
6135 }
6136 /* SoftFloat can handle the rest: */
6137 else
6138 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6139
6140 pFpuRes->FSW = fFsw;
6141}
6142
6143
6144EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6145EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6146EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6147EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6148
6149
6150/*********************************************************************************************************************************
6151* x87 FPU Subtraction *
6152*********************************************************************************************************************************/
6153
6154/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6155static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6156 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6157{
6158 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6159 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6160 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6161}
6162
6163
6164IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6165 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6166{
6167 uint16_t const fFcw = pFpuState->FCW;
6168 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6169
6170 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6171 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6172 {
6173 if (fFcw & X86_FCW_IM)
6174 pFpuRes->r80Result = g_r80Indefinite;
6175 else
6176 {
6177 pFpuRes->r80Result = *pr80Val1;
6178 fFsw |= X86_FSW_ES | X86_FSW_B;
6179 }
6180 fFsw |= X86_FSW_IE;
6181 }
6182 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6183 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6184 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6185 {
6186 if (fFcw & X86_FCW_DM)
6187 {
6188 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6189 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6190 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6191 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6192 }
6193 else
6194 {
6195 pFpuRes->r80Result = *pr80Val1;
6196 fFsw |= X86_FSW_ES | X86_FSW_B;
6197 }
6198 fFsw |= X86_FSW_DE;
6199 }
6200 /* SoftFloat can handle the rest: */
6201 else
6202 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6203
6204 pFpuRes->FSW = fFsw;
6205}
6206
6207
/* Instantiate the remaining FSUB / FISUB entry points via the EMIT_R80_BY_*
 * macros; each invocation names the new function and passes
 * iemAImpl_fsub_r80_by_r80 as the 80-bit implementation to build on.
 * Judging by the names, the second operand is a 64-bit real, 32-bit real,
 * 32-bit integer and 16-bit integer respectively.
 * NOTE(review): the macros are defined outside this section; confirm the
 * meaning of the trailing 0 argument against their definitions. */
6208EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6209EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6210EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6211EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6212
6213
6214/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6215IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6216 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6217{
6218 uint16_t const fFcw = pFpuState->FCW;
6219 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6220
6221 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6222 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6223 {
6224 if (fFcw & X86_FCW_IM)
6225 pFpuRes->r80Result = g_r80Indefinite;
6226 else
6227 {
6228 pFpuRes->r80Result = *pr80Val1;
6229 fFsw |= X86_FSW_ES | X86_FSW_B;
6230 }
6231 fFsw |= X86_FSW_IE;
6232 }
6233 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6234 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6235 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6236 {
6237 if (fFcw & X86_FCW_DM)
6238 {
6239 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6240 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6241 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6242 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6243 }
6244 else
6245 {
6246 pFpuRes->r80Result = *pr80Val1;
6247 fFsw |= X86_FSW_ES | X86_FSW_B;
6248 }
6249 fFsw |= X86_FSW_DE;
6250 }
6251 /* SoftFloat can handle the rest: */
6252 else
6253 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6254
6255 pFpuRes->FSW = fFsw;
6256}
6257
6258
/* Instantiate the remaining FSUBR / FISUBR entry points via the EMIT_R80_BY_*
 * macros; each invocation names the new function and passes
 * iemAImpl_fsubr_r80_by_r80 as the 80-bit implementation to build on.
 * Judging by the names, the second operand is a 64-bit real, 32-bit real,
 * 32-bit integer and 16-bit integer respectively.
 * NOTE(review): the macros are defined outside this section; confirm the
 * meaning of the trailing 0 argument against their definitions. */
6259EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6260EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6261EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6262EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6263
6264
6265/*********************************************************************************************************************************
6266* x87 FPU Trigonometric Operations *
6267*********************************************************************************************************************************/
6268static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6269{
6270 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6271 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6272 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6273 extFloat80_t v;
6274 (void)fFcw;
6275
6276 v = extF80_atan2(y, x, &SoftState);
6277
6278 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6279 return fFsw;
6280}
6281
6282IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6283 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6284{
6285 uint16_t const fFcw = pFpuState->FCW;
6286 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6287
6288 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6289 {
6290 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6291
6292 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6293 if (!(fFcw & X86_FCW_PM))
6294 fFsw |= X86_FSW_ES | X86_FSW_B;
6295 }
6296 else
6297 {
6298 fFsw |= X86_FSW_IE;
6299 if (!(fFcw & X86_FCW_IM))
6300 {
6301 pFpuRes->r80Result = *pr80Val2;
6302 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6303 }
6304 else
6305 {
6306 pFpuRes->r80Result = g_r80Indefinite;
6307 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6308 }
6309 }
6310
6311 pFpuRes->FSW = fFsw;
6312}
6313#endif /* IEM_WITHOUT_ASSEMBLY */
6314
6315IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6316 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6317{
6318 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6319}
6320
6321IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6322 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6323{
6324 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6325}
6326
6327
6328#if defined(IEM_WITHOUT_ASSEMBLY)
6329static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6330{
6331 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6332 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6333 extFloat80_t v;
6334 (void)fFcw;
6335
6336 v = extF80_tan(x, &SoftState);
6337
6338 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6339 return fFsw;
6340}
6341
6342IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6343{
6344 uint16_t const fFcw = pFpuState->FCW;
6345 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6346
6347 if (RTFLOAT80U_IS_ZERO(pr80Val))
6348 {
6349 pFpuResTwo->r80Result1 = *pr80Val;
6350 pFpuResTwo->r80Result2 = g_ar80One[0];
6351 }
6352 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6353 {
6354 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6355 {
6356 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6357 pFpuResTwo->r80Result1 = *pr80Val;
6358 }
6359 else
6360 {
6361 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6362 {
6363 pFpuResTwo->r80Result1 = *pr80Val;
6364 }
6365 else
6366 {
6367 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6368 }
6369
6370 pFpuResTwo->r80Result2 = g_ar80One[0];
6371
6372 fFsw |= X86_FSW_PE;
6373 if (!(fFcw & X86_FCW_PM))
6374 fFsw |= X86_FSW_ES | X86_FSW_B;
6375 }
6376 }
6377 else
6378 {
6379 fFsw |= X86_FSW_IE;
6380 if (!(fFcw & X86_FCW_IM))
6381 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6382 }
6383
6384 pFpuResTwo->FSW = fFsw;
6385}
6386#endif /* IEM_WITHOUT_ASSEMBLY */
6387
6388IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6389{
6390 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6391}
6392
6393IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6394{
6395 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6396}
6397
6398#ifdef IEM_WITHOUT_ASSEMBLY
6399
6400static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6401{
6402 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6403 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6404 extFloat80_t v;
6405 (void)fFcw;
6406
6407 v = extF80_sin(x, &SoftState);
6408
6409 iemFpuSoftF80ToIprt(pr80Result, v);
6410
6411 return fFsw;
6412}
6413
6414IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6415{
6416 uint16_t const fFcw = pFpuState->FCW;
6417 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6418
6419 if (RTFLOAT80U_IS_ZERO(pr80Val))
6420 {
6421 pFpuRes->r80Result = *pr80Val;
6422 }
6423 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6424 {
6425 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6426 {
6427 fFsw |= X86_FSW_C2;
6428 pFpuRes->r80Result = *pr80Val;
6429 }
6430 else
6431 {
6432 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6433 {
6434 pFpuRes->r80Result = *pr80Val;
6435 }
6436 else
6437 {
6438 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6439 }
6440 fFsw |= X86_FSW_PE;
6441 if (!(fFcw & X86_FCW_PM))
6442 fFsw |= X86_FSW_ES | X86_FSW_B;
6443 }
6444 }
6445 else if (RTFLOAT80U_IS_INF(pr80Val))
6446 {
6447 fFsw |= X86_FSW_IE;
6448 if (!(fFcw & X86_FCW_IM))
6449 {
6450 fFsw |= X86_FSW_ES | X86_FSW_B;
6451 pFpuRes->r80Result = *pr80Val;
6452 }
6453 else
6454 {
6455 pFpuRes->r80Result = g_r80Indefinite;
6456 }
6457 }
6458 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6459 {
6460 fFsw |= X86_FSW_DE;
6461
6462 if (fFcw & X86_FCW_DM)
6463 {
6464 if (fFcw & X86_FCW_UM)
6465 {
6466 pFpuRes->r80Result = *pr80Val;
6467 }
6468 else
6469 {
6470 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6471 uint64_t uMantissa = pr80Val->s.uMantissa;
6472 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6473
6474 uExponent = 64 - uExponent;
6475 uMantissa <<= uExponent;
6476 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6477
6478 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6479 pFpuRes->r80Result.s.uMantissa = uMantissa;
6480 pFpuRes->r80Result.s.uExponent = uExponent;
6481 }
6482
6483 fFsw |= X86_FSW_UE | X86_FSW_PE;
6484
6485 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6486 {
6487 /* All the exceptions are masked. */
6488 }
6489 else
6490 {
6491 fFsw |= X86_FSW_ES | X86_FSW_B;
6492 }
6493 }
6494 else
6495 {
6496 pFpuRes->r80Result = *pr80Val;
6497
6498 fFsw |= X86_FSW_ES | X86_FSW_B;
6499 }
6500 }
6501 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6502 {
6503 pFpuRes->r80Result = *pr80Val;
6504 fFsw |= X86_FSW_DE;
6505
6506 if (fFcw & X86_FCW_DM)
6507 {
6508 if (fFcw & X86_FCW_PM)
6509 {
6510 fFsw |= X86_FSW_PE;
6511 }
6512 else
6513 {
6514 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6515 }
6516
6517 pFpuRes->r80Result.sj64.uExponent = 1;
6518 }
6519 else
6520 {
6521 fFsw |= X86_FSW_ES | X86_FSW_B;
6522 }
6523 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6524 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6525 {
6526 pFpuRes->r80Result = *pr80Val;
6527 } else {
6528 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6529 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6530 && (fFcw & X86_FCW_IM))
6531 pFpuRes->r80Result = g_r80Indefinite;
6532 else
6533 {
6534 pFpuRes->r80Result = *pr80Val;
6535 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6536 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6537 }
6538
6539 fFsw |= X86_FSW_IE;
6540 if (!(fFcw & X86_FCW_IM))
6541 fFsw |= X86_FSW_ES | X86_FSW_B;
6542 }
6543
6544 pFpuRes->FSW = fFsw;
6545}
6546#endif /* IEM_WITHOUT_ASSEMBLY */
6547
6548IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6549{
6550 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6551}
6552
6553IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6554{
6555 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6556}
6557
6558#ifdef IEM_WITHOUT_ASSEMBLY
6559
6560static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6561{
6562 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6563 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6564 extFloat80_t v;
6565 (void)fFcw;
6566
6567 v = extF80_cos(x, &SoftState);
6568
6569 iemFpuSoftF80ToIprt(pr80Result, v);
6570
6571 return fFsw;
6572}
6573
6574IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6575{
6576 uint16_t const fFcw = pFpuState->FCW;
6577 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6578
6579 if (RTFLOAT80U_IS_ZERO(pr80Val))
6580 {
6581 pFpuRes->r80Result = g_ar80One[0];
6582 }
6583 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6584 {
6585 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6586 {
6587 fFsw |= X86_FSW_C2;
6588 pFpuRes->r80Result = *pr80Val;
6589 }
6590 else
6591 {
6592 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6593 {
6594 pFpuRes->r80Result = g_ar80One[0];
6595
6596 }
6597 else
6598 {
6599 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6600 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6601 }
6602 fFsw |= X86_FSW_PE;
6603 if (!(fFcw & X86_FCW_PM))
6604 fFsw |= X86_FSW_ES | X86_FSW_B;
6605 }
6606 }
6607 else if (RTFLOAT80U_IS_INF(pr80Val))
6608 {
6609 fFsw |= X86_FSW_IE;
6610 if (!(fFcw & X86_FCW_IM))
6611 {
6612 fFsw |= X86_FSW_ES | X86_FSW_B;
6613 pFpuRes->r80Result = *pr80Val;
6614 }
6615 else
6616 {
6617 pFpuRes->r80Result = g_r80Indefinite;
6618 }
6619 }
6620 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6621 {
6622 fFsw |= X86_FSW_DE;
6623
6624 if (fFcw & X86_FCW_DM)
6625 {
6626 pFpuRes->r80Result = g_ar80One[0];
6627
6628 if (fFcw & X86_FCW_PM)
6629 {
6630 fFsw |= X86_FSW_PE;
6631 }
6632 else
6633 {
6634 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6635 }
6636 }
6637 else
6638 {
6639 pFpuRes->r80Result = *pr80Val;
6640 fFsw |= X86_FSW_ES | X86_FSW_B;
6641 }
6642 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6643 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6644 {
6645 pFpuRes->r80Result = *pr80Val;
6646 } else {
6647 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6648 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6649 && (fFcw & X86_FCW_IM))
6650 pFpuRes->r80Result = g_r80Indefinite;
6651 else
6652 {
6653 pFpuRes->r80Result = *pr80Val;
6654 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6655 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6656 }
6657
6658 fFsw |= X86_FSW_IE;
6659 if (!(fFcw & X86_FCW_IM))
6660 fFsw |= X86_FSW_ES | X86_FSW_B;
6661 }
6662
6663 pFpuRes->FSW = fFsw;
6664}
6665#endif /* IEM_WITHOUT_ASSEMBLY */
6666
6667IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6668{
6669 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6670}
6671
6672IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6673{
6674 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6675}
6676
6677#ifdef IEM_WITHOUT_ASSEMBLY
6678
6679static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6680{
6681 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6682 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6683 extFloat80_t r80Sin, r80Cos;
6684 (void)fFcw;
6685
6686 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6687
6688 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6689 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6690
6691 return fFsw;
6692}
6693
6694IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6695{
6696 uint16_t const fFcw = pFpuState->FCW;
6697 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6698
6699 if (RTFLOAT80U_IS_ZERO(pr80Val))
6700 {
6701 pFpuResTwo->r80Result1 = *pr80Val;
6702 pFpuResTwo->r80Result2 = g_ar80One[0];
6703 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6704 }
6705 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6706 {
6707 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6708 {
6709 fFsw |= X86_FSW_C2;
6710
6711 if (fFcw & X86_FCW_IM)
6712 {
6713 pFpuResTwo->r80Result1 = g_r80Indefinite;
6714 }
6715 else
6716 {
6717 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6718 }
6719
6720 pFpuResTwo->r80Result2 = *pr80Val;
6721 }
6722 else
6723 {
6724 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6725
6726 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6727 {
6728 pFpuResTwo->r80Result1 = *pr80Val;
6729 pFpuResTwo->r80Result2 = g_ar80One[0];
6730 }
6731 else
6732 {
6733 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6734 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6735 }
6736 fFsw |= X86_FSW_PE;
6737 if (!(fFcw & X86_FCW_PM))
6738 fFsw |= X86_FSW_ES | X86_FSW_B;
6739 }
6740 }
6741 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6742 {
6743 fFsw |= X86_FSW_DE;
6744
6745 if (fFcw & X86_FCW_DM)
6746 {
6747 pFpuResTwo->r80Result1 = *pr80Val;
6748 pFpuResTwo->r80Result2 = g_ar80One[0];
6749 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6750
6751 if (fFcw & X86_FCW_PM)
6752 {
6753 fFsw |= X86_FSW_PE;
6754 }
6755 else
6756 {
6757 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6758 }
6759
6760 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6761 }
6762 else
6763 {
6764 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6765 pFpuResTwo->r80Result2 = *pr80Val;
6766 fFsw |= X86_FSW_ES | X86_FSW_B;
6767 }
6768 }
6769 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6770 {
6771 fFsw |= X86_FSW_DE;
6772
6773 if (fFcw & X86_FCW_DM)
6774 {
6775 pFpuResTwo->r80Result2 = g_ar80One[0];
6776
6777 if (fFcw & X86_FCW_UM)
6778 {
6779 pFpuResTwo->r80Result1 = *pr80Val;
6780 }
6781 else
6782 {
6783 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6784 uint64_t uMantissa = pr80Val->s.uMantissa;
6785 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6786
6787 uExponent = 64 - uExponent;
6788 uMantissa <<= uExponent;
6789 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6790
6791 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6792 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6793 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6794 }
6795
6796 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6797 fFsw |= X86_FSW_UE | X86_FSW_PE;
6798
6799 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6800 {
6801 /* All the exceptions are masked. */
6802 }
6803 else
6804 {
6805 fFsw |= X86_FSW_ES | X86_FSW_B;
6806 }
6807 }
6808 else
6809 {
6810 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6811 pFpuResTwo->r80Result2 = *pr80Val;
6812 fFsw |= X86_FSW_ES | X86_FSW_B;
6813 }
6814 }
6815 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6816 {
6817 pFpuResTwo->r80Result1 = *pr80Val;
6818 pFpuResTwo->r80Result2 = *pr80Val;
6819 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6820 }
6821 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6822 {
6823 if (fFcw & X86_FCW_IM)
6824 {
6825 pFpuResTwo->r80Result1 = g_r80Indefinite;
6826 pFpuResTwo->r80Result2 = g_r80Indefinite;
6827 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6828 }
6829 else
6830 {
6831 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6832 pFpuResTwo->r80Result2 = *pr80Val;
6833 }
6834
6835 fFsw |= X86_FSW_IE;
6836 if (!(fFcw & X86_FCW_IM))
6837 fFsw |= X86_FSW_ES | X86_FSW_B;
6838 }
6839 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6840 {
6841 pFpuResTwo->r80Result1 = *pr80Val;
6842 pFpuResTwo->r80Result2 = *pr80Val;
6843
6844 if (fFcw & X86_FCW_IM)
6845 {
6846 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6847 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6848 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6849 }
6850 else
6851 {
6852 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6853 pFpuResTwo->r80Result2 = *pr80Val;
6854 }
6855
6856 fFsw |= X86_FSW_IE;
6857 if (!(fFcw & X86_FCW_IM))
6858 fFsw |= X86_FSW_ES | X86_FSW_B;
6859 }
6860 else if (RTFLOAT80U_IS_INF(pr80Val))
6861 {
6862 if (fFcw & X86_FCW_IM)
6863 {
6864 pFpuResTwo->r80Result1 = g_r80Indefinite;
6865 pFpuResTwo->r80Result2 = g_r80Indefinite;
6866 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6867 }
6868 else
6869 {
6870 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6871 pFpuResTwo->r80Result2 = *pr80Val;
6872 }
6873
6874 fFsw |= X86_FSW_IE;
6875 if (!(fFcw & X86_FCW_IM))
6876 fFsw |= X86_FSW_ES | X86_FSW_B;
6877 }
6878
6879 pFpuResTwo->FSW = fFsw;
6880}
6881#endif /* IEM_WITHOUT_ASSEMBLY */
6882
6883IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6884{
6885 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6886}
6887
6888IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6889{
6890 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6891}
6892
6893#ifdef IEM_WITHOUT_ASSEMBLY
6894
6895
6896/*********************************************************************************************************************************
6897* x87 FPU Compare and Testing Operations *
6898*********************************************************************************************************************************/
6899
6900IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6901{
6902 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6903
6904 if (RTFLOAT80U_IS_ZERO(pr80Val))
6905 fFsw |= X86_FSW_C3;
6906 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6907 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6908 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6909 {
6910 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6911 if (!(pFpuState->FCW & X86_FCW_DM))
6912 fFsw |= X86_FSW_ES | X86_FSW_B;
6913 }
6914 else
6915 {
6916 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6917 if (!(pFpuState->FCW & X86_FCW_IM))
6918 fFsw |= X86_FSW_ES | X86_FSW_B;
6919 }
6920
6921 *pu16Fsw = fFsw;
6922}
6923
6924
6925IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6926{
6927 RT_NOREF(pFpuState);
6928 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6929
6930 /* C1 = sign bit (always, even if empty Intel says). */
6931 if (pr80Val->s.fSign)
6932 fFsw |= X86_FSW_C1;
6933
6934 /* Classify the value in C0, C2, C3. */
6935 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6936 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6937 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6938 fFsw |= X86_FSW_C2;
6939 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6940 fFsw |= X86_FSW_C3;
6941 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6942 fFsw |= X86_FSW_C0;
6943 else if (RTFLOAT80U_IS_INF(pr80Val))
6944 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6945 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6946 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6947 /* whatever else: 0 */
6948
6949 *pu16Fsw = fFsw;
6950}
6951
6952
6953/**
6954 * Worker for fcom, fucom, and friends.
6955 */
6956static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6957 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6958{
6959 /*
6960 * Unpack the values.
6961 */
6962 bool const fSign1 = pr80Val1->s.fSign;
6963 int32_t iExponent1 = pr80Val1->s.uExponent;
6964 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6965
6966 bool const fSign2 = pr80Val2->s.fSign;
6967 int32_t iExponent2 = pr80Val2->s.uExponent;
6968 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6969
6970 /*
6971 * Check for invalid inputs.
6972 */
6973 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6974 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6975 {
6976 if (!(fFcw & X86_FCW_IM))
6977 fFsw |= X86_FSW_ES | X86_FSW_B;
6978 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6979 }
6980
6981 /*
6982 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6983 */
6984 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6985 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6986 {
6987 if ( fIeOnAllNaNs
6988 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6989 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6990 {
6991 fFsw |= X86_FSW_IE;
6992 if (!(fFcw & X86_FCW_IM))
6993 fFsw |= X86_FSW_ES | X86_FSW_B;
6994 }
6995 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6996 }
6997
6998 /*
6999 * Normalize the values.
7000 */
7001 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7002 {
7003 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7004 iExponent1 = 1;
7005 else
7006 {
7007 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
7008 uMantissa1 <<= iExponent1;
7009 iExponent1 = 1 - iExponent1;
7010 }
7011 fFsw |= X86_FSW_DE;
7012 if (!(fFcw & X86_FCW_DM))
7013 fFsw |= X86_FSW_ES | X86_FSW_B;
7014 }
7015
7016 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7017 {
7018 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7019 iExponent2 = 1;
7020 else
7021 {
7022 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
7023 uMantissa2 <<= iExponent2;
7024 iExponent2 = 1 - iExponent2;
7025 }
7026 fFsw |= X86_FSW_DE;
7027 if (!(fFcw & X86_FCW_DM))
7028 fFsw |= X86_FSW_ES | X86_FSW_B;
7029 }
7030
7031 /*
7032 * Test if equal (val1 == val2):
7033 */
7034 if ( uMantissa1 == uMantissa2
7035 && iExponent1 == iExponent2
7036 && ( fSign1 == fSign2
7037 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
7038 fFsw |= X86_FSW_C3;
7039 /*
7040 * Test if less than (val1 < val2):
7041 */
7042 else if (fSign1 && !fSign2)
7043 fFsw |= X86_FSW_C0;
7044 else if (fSign1 == fSign2)
7045 {
7046 /* Zeros are problematic, however at the most one can be zero here. */
7047 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
7048 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7049 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
7050 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7051
7052 if ( fSign1
7053 ^ ( iExponent1 < iExponent2
7054 || ( iExponent1 == iExponent2
7055 && uMantissa1 < uMantissa2 ) ) )
7056 fFsw |= X86_FSW_C0;
7057 }
7058 /* else: No flags set if greater. */
7059
7060 return fFsw;
7061}
7062
7063
7064IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7065 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7066{
7067 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7068}
7069
7070
7071
7072
7073IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7074 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7075{
7076 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
7077}
7078
7079
7080IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7081 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
7082{
7083 RTFLOAT80U r80Val2;
7084 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
7085 Assert(!fFsw || fFsw == X86_FSW_DE);
7086 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7087 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7088 {
7089 if (!(pFpuState->FCW & X86_FCW_DM))
7090 fFsw |= X86_FSW_ES | X86_FSW_B;
7091 *pfFsw |= fFsw;
7092 }
7093}
7094
7095
7096IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7097 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
7098{
7099 RTFLOAT80U r80Val2;
7100 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
7101 Assert(!fFsw || fFsw == X86_FSW_DE);
7102 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7103 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7104 {
7105 if (!(pFpuState->FCW & X86_FCW_DM))
7106 fFsw |= X86_FSW_ES | X86_FSW_B;
7107 *pfFsw |= fFsw;
7108 }
7109}
7110
7111
7112IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7113 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
7114{
7115 RTFLOAT80U r80Val2;
7116 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
7117 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7118}
7119
7120
7121IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7122 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
7123{
7124 RTFLOAT80U r80Val2;
7125 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
7126 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7127}
7128
7129
7130/**
7131 * Worker for fcomi & fucomi.
7132 */
7133static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7134 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
7135{
7136 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
7137 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
7138 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
7139 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
7140
7141 /* Note! C1 is not cleared as per docs! Everything is preserved. */
7142 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
7143 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
7144}
7145
7146
7147IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7148 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7149{
7150 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
7151}
7152
7153
7154IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7155 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7156{
7157 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
7158}
7159
7160
7161/*********************************************************************************************************************************
7162* x87 FPU Other Operations *
7163*********************************************************************************************************************************/
7164
7165/**
7166 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7167 */
7168static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7169{
7170 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7171 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7172 true /*exact / generate #PE */, &SoftState));
7173 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7174}
7175
7176
7177IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7178{
7179 uint16_t const fFcw = pFpuState->FCW;
7180 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7181
7182 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7183 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7184 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7185 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7186 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7187 || RTFLOAT80U_IS_INF(pr80Val))
7188 pFpuRes->r80Result = *pr80Val;
7189 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7190 {
7191 fFsw |= X86_FSW_DE;
7192 if (fFcw & X86_FCW_DM)
7193 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7194 else
7195 {
7196 pFpuRes->r80Result = *pr80Val;
7197 fFsw |= X86_FSW_ES | X86_FSW_B;
7198 }
7199 }
7200 else
7201 {
7202 if (fFcw & X86_FCW_IM)
7203 {
7204 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7205 pFpuRes->r80Result = g_r80Indefinite;
7206 else
7207 {
7208 pFpuRes->r80Result = *pr80Val;
7209 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7210 }
7211 }
7212 else
7213 {
7214 pFpuRes->r80Result = *pr80Val;
7215 fFsw |= X86_FSW_ES | X86_FSW_B;
7216 }
7217 fFsw |= X86_FSW_IE;
7218 }
7219 pFpuRes->FSW = fFsw;
7220}
7221
7222
7223IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7224 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7225{
7226 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7227 it does everything we need it to do. */
7228 uint16_t const fFcw = pFpuState->FCW;
7229 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7230 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7231 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7232 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7233}
7234
7235
7236/**
7237 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7238 */
7239static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7240{
7241 Assert(!pr80Val->s.fSign);
7242 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7243 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7244 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7245}
7246
7247
7248IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7249{
7250 uint16_t const fFcw = pFpuState->FCW;
7251 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7252
7253 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7254 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7255 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7256 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7257 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7258 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7259 pFpuRes->r80Result = *pr80Val;
7260 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7261 {
7262 fFsw |= X86_FSW_DE;
7263 if (fFcw & X86_FCW_DM)
7264 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7265 else
7266 {
7267 pFpuRes->r80Result = *pr80Val;
7268 fFsw |= X86_FSW_ES | X86_FSW_B;
7269 }
7270 }
7271 else
7272 {
7273 if (fFcw & X86_FCW_IM)
7274 {
7275 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7276 pFpuRes->r80Result = g_r80Indefinite;
7277 else
7278 {
7279 pFpuRes->r80Result = *pr80Val;
7280 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7281 }
7282 }
7283 else
7284 {
7285 pFpuRes->r80Result = *pr80Val;
7286 fFsw |= X86_FSW_ES | X86_FSW_B;
7287 }
7288 fFsw |= X86_FSW_IE;
7289 }
7290 pFpuRes->FSW = fFsw;
7291}
7292
7293
7294/**
7295 * @code{.unparsed}
7296 * x x * ln2
7297 * f(x) = 2 - 1 = e - 1
7298 *
7299 * @endcode
7300 *
7301 * We can approximate e^x by a Taylor/Maclaurin series (see
7302 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7303 * @code{.unparsed}
7304 * n 0 1 2 3 4
7305 * inf x x x x x x
7306 * SUM ----- = --- + --- + --- + --- + --- + ...
7307 * n=0 n! 0! 1! 2! 3! 4!
7308 *
7309 * 2 3 4
7310 * x x x
7311 * = 1 + x + --- + --- + --- + ...
7312 * 2! 3! 4!
7313 * @endcode
7314 *
7315 * Given z = x * ln2, we get:
7316 * @code{.unparsed}
7317 * 2 3 4 n
7318 * z z z z z
7319 * e - 1 = z + --- + --- + --- + ... + ---
7320 * 2! 3! 4! n!
7321 * @endcode
7322 *
7323 * Wanting to use Horner's method, we move one z outside and get:
7324 * @code{.unparsed}
7325 * 2 3 (n-1)
7326 * z z z z
7327 * = z ( 1 + --- + --- + --- + ... + ------- )
7328 * 2! 3! 4! n!
7329 * @endcode
7330 *
7331 * The constants we need for using Horner's methods are 1 and 1 / n!.
7332 *
7333 * For very tiny x values, we can get away with f(x) = x * ln 2, because
7334 * because we don't have the necessary precision to represent 1.0 + z/3 + ...
7335 * and can approximate it to be 1.0. For a visual demonstration of this
7336 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
7337 * as it valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7338 *
7339 *
7340 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7341 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7342 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7343 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7344 * blocks). (The one bit difference is probably an implicit one missing from
7345 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7346 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7347 * exponent.
7348 *
7349 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
7350 * successfully reproduced the exact results from an Intel 10980XE, there is
7351 * always a portition of rounding differences. Not going to spend too much time
7352 * on getting this 100% the same, at least not now.
7353 *
7354 * P.S. If someone are really curious about 8087 and its contstants:
7355 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7356 *
7357 *
7358 * @param pr80Val The exponent value (x), less than 1.0, greater than
7359 * -1.0 and not zero. This can be a normal, denormal
7360 * or pseudo-denormal value.
7361 * @param pr80Result Where to return the result.
7362 * @param fFcw FPU control word.
7363 * @param fFsw FPU status word.
7364 */
7365static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7366{
7367 /* As mentioned above, we can skip the expensive polynomial calculation
7368 as it will be close enough to 1.0 that it makes no difference.
7369
7370 The cutoff point for intel 10980XE is exponents >= -69. Intel
7371 also seems to be using a 67-bit or 68-bit constant value, and we get
7372 a smattering of rounding differences if we go for higher precision. */
7373 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7374 {
7375 RTUINT256U u256;
7376 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7377 u256.QWords.qw0 |= 1; /* force #PE */
7378 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7379 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7380 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7381 : 1 - RTFLOAT80U_EXP_BIAS,
7382 fFcw, fFsw);
7383 }
7384 else
7385 {
7386#ifdef IEM_WITH_FLOAT128_FOR_FPU
7387 /* This approach is not good enough for small values, we end up with zero. */
7388 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7389 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7390 _Float128 rd128Result = powf128(2.0L, rd128Val);
7391 rd128Result -= 1.0L;
7392 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7393 iemFpuF128RestoreRounding(fOldRounding);
7394
7395# else
7396 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7397 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7398
7399 /* As mentioned above, enforce 68-bit internal mantissa width to better
7400 match the Intel 10980XE results. */
7401 unsigned const cPrecision = 68;
7402
7403 /* first calculate z = x * ln2 */
7404 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7405 cPrecision);
7406
7407 /* Then do the polynomial evaluation. */
7408 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7409 cPrecision, &SoftState);
7410 r = f128_mul(z, r, &SoftState);
7411
7412 /* Output the result. */
7413 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7414# endif
7415 }
7416 return fFsw;
7417}
7418
7419
7420IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7421{
7422 uint16_t const fFcw = pFpuState->FCW;
7423 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7424
7425 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7426 {
7427 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7428 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7429 else
7430 {
7431 /* Special case:
7432 2^+1.0 - 1.0 = 1.0
7433 2^-1.0 - 1.0 = -0.5 */
7434 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7435 && pr80Val->s.uMantissa == RT_BIT_64(63))
7436 {
7437 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7438 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7439 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7440 }
7441 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7442 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7443 else
7444 pFpuRes->r80Result = *pr80Val;
7445 fFsw |= X86_FSW_PE;
7446 if (!(fFcw & X86_FCW_PM))
7447 fFsw |= X86_FSW_ES | X86_FSW_B;
7448 }
7449 }
7450 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7451 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7452 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7453 pFpuRes->r80Result = *pr80Val;
7454 else if (RTFLOAT80U_IS_INF(pr80Val))
7455 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7456 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7457 {
7458 fFsw |= X86_FSW_DE;
7459 if (fFcw & X86_FCW_DM)
7460 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7461 else
7462 {
7463 pFpuRes->r80Result = *pr80Val;
7464 fFsw |= X86_FSW_ES | X86_FSW_B;
7465 }
7466 }
7467 else
7468 {
7469 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7470 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7471 && (fFcw & X86_FCW_IM))
7472 pFpuRes->r80Result = g_r80Indefinite;
7473 else
7474 {
7475 pFpuRes->r80Result = *pr80Val;
7476 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7477 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7478 }
7479 fFsw |= X86_FSW_IE;
7480 if (!(fFcw & X86_FCW_IM))
7481 fFsw |= X86_FSW_ES | X86_FSW_B;
7482 }
7483 pFpuRes->FSW = fFsw;
7484}
7485
7486#endif /* IEM_WITHOUT_ASSEMBLY */
7487
7488IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7489{
7490 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7491}
7492
7493IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7494{
7495 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7496}
7497
7498#ifdef IEM_WITHOUT_ASSEMBLY
7499
7500IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7501{
7502 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7503 pFpuRes->r80Result = *pr80Val;
7504 pFpuRes->r80Result.s.fSign = 0;
7505}
7506
7507
7508IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7509{
7510 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7511 pFpuRes->r80Result = *pr80Val;
7512 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7513}
7514
7515
7516IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7517{
7518 uint16_t const fFcw = pFpuState->FCW;
7519 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7520
7521 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7522 {
7523 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7524 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7525
7526 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7527 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7528 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7529 }
7530 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7531 {
7532 fFsw |= X86_FSW_ZE;
7533 if (fFcw & X86_FCW_ZM)
7534 {
7535 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7536 pFpuResTwo->r80Result2 = *pr80Val;
7537 }
7538 else
7539 {
7540 pFpuResTwo->r80Result2 = *pr80Val;
7541 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7542 }
7543 }
7544 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7545 {
7546 fFsw |= X86_FSW_DE;
7547 if (fFcw & X86_FCW_DM)
7548 {
7549 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7550 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7551 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7552 int32_t iExponent = -16382;
7553 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7554 {
7555 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7556 iExponent--;
7557 }
7558
7559 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7560 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7561 }
7562 else
7563 {
7564 pFpuResTwo->r80Result2 = *pr80Val;
7565 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7566 }
7567 }
7568 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7569 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7570 {
7571 pFpuResTwo->r80Result1 = *pr80Val;
7572 pFpuResTwo->r80Result2 = *pr80Val;
7573 }
7574 else if (RTFLOAT80U_IS_INF(pr80Val))
7575 {
7576 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7577 pFpuResTwo->r80Result2 = *pr80Val;
7578 }
7579 else
7580 {
7581 if (fFcw & X86_FCW_IM)
7582 {
7583 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7584 pFpuResTwo->r80Result1 = g_r80Indefinite;
7585 else
7586 {
7587 pFpuResTwo->r80Result1 = *pr80Val;
7588 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7589 }
7590 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7591 }
7592 else
7593 {
7594 pFpuResTwo->r80Result2 = *pr80Val;
7595 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7596 }
7597 fFsw |= X86_FSW_IE;
7598 }
7599 pFpuResTwo->FSW = fFsw;
7600}
7601#endif /* IEM_WITHOUT_ASSEMBLY */
7602
7603#if defined(IEM_WITHOUT_ASSEMBLY)
7604
7605static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7606{
7607 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7608 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7609 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7610 extFloat80_t v;
7611 (void)fFcw;
7612
7613 v = extF80_ylog2x(y, x, &SoftState);
7614 iemFpuSoftF80ToIprt(pr80Result, v);
7615
7616 return fFsw;
7617}
7618
7619IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7620 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7621{
7622 uint16_t const fFcw = pFpuState->FCW;
7623 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7624
7625 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7626 {
7627 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7628
7629 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7630 if (!(fFcw & X86_FCW_PM))
7631 fFsw |= X86_FSW_ES | X86_FSW_B;
7632 }
7633 else
7634 {
7635 fFsw |= X86_FSW_IE;
7636
7637 if (!(fFcw & X86_FCW_IM))
7638 {
7639 pFpuRes->r80Result = *pr80Val2;
7640 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7641 }
7642 else
7643 {
7644 pFpuRes->r80Result = g_r80Indefinite;
7645 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7646 }
7647 }
7648
7649 pFpuRes->FSW = fFsw;
7650}
7651#endif /* IEM_WITHOUT_ASSEMBLY */
7652
7653IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7654 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7655{
7656 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7657}
7658
7659IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7660 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7661{
7662 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7663}
7664
7665#if defined(IEM_WITHOUT_ASSEMBLY)
7666
7667static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7668{
7669 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7670 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7671 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7672 extFloat80_t v;
7673 (void)fFcw;
7674
7675 v = extF80_ylog2xp1(y, x, &SoftState);
7676 iemFpuSoftF80ToIprt(pr80Result, v);
7677
7678 return fFsw;
7679}
7680
7681IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7682 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7683{
7684 uint16_t const fFcw = pFpuState->FCW;
7685 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7686
7687 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7688 {
7689 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7690
7691 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7692 if (!(fFcw & X86_FCW_PM))
7693 fFsw |= X86_FSW_ES | X86_FSW_B;
7694 }
7695 else
7696 {
7697 fFsw |= X86_FSW_IE;
7698
7699 if (!(fFcw & X86_FCW_IM))
7700 {
7701 pFpuRes->r80Result = *pr80Val2;
7702 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7703 }
7704 else
7705 {
7706 pFpuRes->r80Result = g_r80Indefinite;
7707 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7708 }
7709 }
7710
7711 pFpuRes->FSW = fFsw;
7712}
7713
7714#endif /* IEM_WITHOUT_ASSEMBLY */
7715
7716IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7717 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7718{
7719 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7720}
7721
7722IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7723 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7724{
7725 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7726}
7727
7728
7729/*********************************************************************************************************************************
7730* MMX, SSE & AVX *
7731*********************************************************************************************************************************/
7732
7733#ifdef IEM_WITH_VEX
7734
7735/*
7736 * VMOVSLDUP
7737 */
7738IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7739{
7740 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7741 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7742 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7743 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7744 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7745 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7746 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7747 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7748}
7749
7750
7751IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7752{
7753 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7754 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7755 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7756 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7757 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7758 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7759 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7760 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7761}
7762
7763#endif /* IEM_WITH_VEX */
7764
7765
7766#ifdef IEM_WITH_VEX
7767
7768/*
7769 * VMOVSHDUP
7770 */
7771IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7772{
7773 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7774 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7775 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7776 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7777 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7778 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7779 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7780 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7781}
7782
7783
7784IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7785{
7786 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7787 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7788 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7789 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7790 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7791 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7792 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7793 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7794}
7795
7796#endif /* IEM_WITH_VEX */
7797
7798
7799#ifdef IEM_WITH_VEX
7800
7801/*
7802 * VMOVDDUP
7803 */
7804IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7805{
7806 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7807 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7808 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7809 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7810}
7811
7812IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7813{
7814 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7815 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7816 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7817 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7818}
7819
7820#endif /* IEM_WITH_VEX */
7821
7822
7823/*
 * PAND / VPAND / ANDPS / VANDPS / ANDPD / VANDPD
7825 */
7826#ifdef IEM_WITHOUT_ASSEMBLY
7827
7828IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7829{
7830 RT_NOREF(pFpuState);
7831 *puDst &= *puSrc;
7832}
7833
7834
7835IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7836{
7837 RT_NOREF(pFpuState);
7838 puDst->au64[0] &= puSrc->au64[0];
7839 puDst->au64[1] &= puSrc->au64[1];
7840}
7841
7842#endif
7843
7844IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7845 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7846{
7847 RT_NOREF(pExtState);
7848 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7849 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7850}
7851
7852
7853IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7854 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7855{
7856 RT_NOREF(pExtState);
7857 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7858 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7859 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7860 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7861}
7862
7863
7864/*
 * PANDN / VPANDN / ANDNPS / VANDNPS / ANDNPD / VANDNPD
7866 */
7867#ifdef IEM_WITHOUT_ASSEMBLY
7868
7869IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7870{
7871 RT_NOREF(pFpuState);
7872 *puDst = ~*puDst & *puSrc;
7873}
7874
7875
7876IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7877{
7878 RT_NOREF(pFpuState);
7879 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7880 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7881}
7882
7883#endif
7884
7885IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7886 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7887{
7888 RT_NOREF(pExtState);
7889 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7890 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7891}
7892
7893
7894IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7895 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7896{
7897 RT_NOREF(pExtState);
7898 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7899 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7900 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7901 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7902}
7903
7904
7905/*
 * POR / VPOR / ORPS / VORPS / ORPD / VORPD
7907 */
7908#ifdef IEM_WITHOUT_ASSEMBLY
7909
7910IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7911{
7912 RT_NOREF(pFpuState);
7913 *puDst |= *puSrc;
7914}
7915
7916
7917IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7918{
7919 RT_NOREF(pFpuState);
7920 puDst->au64[0] |= puSrc->au64[0];
7921 puDst->au64[1] |= puSrc->au64[1];
7922}
7923
7924#endif
7925
7926IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7927 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7928{
7929 RT_NOREF(pExtState);
7930 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7931 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7932}
7933
7934
7935IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7936 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7937{
7938 RT_NOREF(pExtState);
7939 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7940 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7941 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7942 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7943}
7944
7945
7946/*
 * PXOR / VPXOR / XORPS / VXORPS / XORPD / VXORPD
7948 */
7949#ifdef IEM_WITHOUT_ASSEMBLY
7950
7951IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7952{
7953 RT_NOREF(pFpuState);
7954 *puDst ^= *puSrc;
7955}
7956
7957
7958IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7959{
7960 RT_NOREF(pFpuState);
7961 puDst->au64[0] ^= puSrc->au64[0];
7962 puDst->au64[1] ^= puSrc->au64[1];
7963}
7964
7965#endif
7966
7967IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7968 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7969{
7970 RT_NOREF(pExtState);
7971 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7972 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7973}
7974
7975
7976IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7977 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7978{
7979 RT_NOREF(pExtState);
7980 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7981 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7982 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7983 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7984}
7985
7986
7987/*
7988 * PCMPEQB / VPCMPEQB
7989 */
7990#ifdef IEM_WITHOUT_ASSEMBLY
7991
7992IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7993{
7994 RT_NOREF(pFpuState);
7995 RTUINT64U uSrc1 = { *puDst };
7996 RTUINT64U uSrc2 = { *puSrc };
7997 RTUINT64U uDst;
7998 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7999 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
8000 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
8001 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
8002 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
8003 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
8004 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
8005 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
8006 *puDst = uDst.u;
8007}
8008
8009
8010IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8011{
8012 RT_NOREF(pFpuState);
8013 RTUINT128U uSrc1 = *puDst;
8014 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
8015 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
8016 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
8017 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
8018 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
8019 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
8020 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
8021 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
8022 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
8023 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
8024 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
8025 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
8026 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
8027 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
8028 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
8029 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
8030}
8031
8032#endif
8033
8034IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8035 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8036{
8037 RT_NOREF(pExtState);
8038 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8039 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8040 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8041 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8042 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8043 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8044 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8045 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8046 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8047 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8048 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8049 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8050 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8051 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8052 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8053 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8054}
8055
8056IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8057 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8058{
8059 RT_NOREF(pExtState);
8060 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8061 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8062 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8063 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8064 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8065 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8066 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8067 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8068 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8069 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8070 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8071 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8072 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8073 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8074 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8075 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8076 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
8077 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
8078 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
8079 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
8080 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
8081 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
8082 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
8083 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
8084 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
8085 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
8086 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
8087 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
8088 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
8089 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
8090 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
8091 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
8092}
8093
8094
8095/*
8096 * PCMPEQW / VPCMPEQW
8097 */
8098#ifdef IEM_WITHOUT_ASSEMBLY
8099
8100IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8101{
8102 RT_NOREF(pFpuState);
8103 RTUINT64U uSrc1 = { *puDst };
8104 RTUINT64U uSrc2 = { *puSrc };
8105 RTUINT64U uDst;
8106 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
8107 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
8108 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
8109 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
8110 *puDst = uDst.u;
8111}
8112
8113
8114IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8115{
8116 RT_NOREF(pFpuState);
8117 RTUINT128U uSrc1 = *puDst;
8118 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
8119 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
8120 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
8121 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
8122 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
8123 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
8124 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
8125 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
8126}
8127
8128#endif
8129
8130IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8131 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8132{
8133 RT_NOREF(pExtState);
8134 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8135 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8136 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8137 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8138 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8139 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8140 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8141 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8142}
8143
8144IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8145 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8146{
8147 RT_NOREF(pExtState);
8148 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8149 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8150 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8151 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8152 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8153 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8154 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8155 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8156 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8157 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8158 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8159 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8160 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8161 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8162 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8163 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8164}
8165
8166
8167/*
8168 * PCMPEQD / VPCMPEQD.
8169 */
8170#ifdef IEM_WITHOUT_ASSEMBLY
8171
8172IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8173{
8174 RT_NOREF(pFpuState);
8175 RTUINT64U uSrc1 = { *puDst };
8176 RTUINT64U uSrc2 = { *puSrc };
8177 RTUINT64U uDst;
8178 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8179 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8180 *puDst = uDst.u;
8181}
8182
8183
8184IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8185{
8186 RT_NOREF(pFpuState);
8187 RTUINT128U uSrc1 = *puDst;
8188 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8189 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8190 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8191 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8192}
8193
8194#endif /* IEM_WITHOUT_ASSEMBLY */
8195
8196IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8197 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8198{
8199 RT_NOREF(pExtState);
8200 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8201 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8202 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8203 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8204}
8205
8206IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8207 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8208{
8209 RT_NOREF(pExtState);
8210 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8211 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8212 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8213 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8214 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8215 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8216 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8217 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8218}
8219
8220
8221/*
8222 * PCMPEQQ / VPCMPEQQ.
8223 */
8224IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8225{
8226 RT_NOREF(pFpuState);
8227 RTUINT128U uSrc1 = *puDst;
8228 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8229 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8230}
8231
8232IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8233 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8234{
8235 RT_NOREF(pExtState);
8236 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8237 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8238}
8239
8240IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8241 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8242{
8243 RT_NOREF(pExtState);
8244 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8245 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8246 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8247 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8248}
8249
8250
8251/*
8252 * PCMPGTB / VPCMPGTB
8253 */
8254#ifdef IEM_WITHOUT_ASSEMBLY
8255
8256IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8257{
8258 RT_NOREF(pFpuState);
8259 RTUINT64U uSrc1 = { *puDst };
8260 RTUINT64U uSrc2 = { *puSrc };
8261 RTUINT64U uDst;
8262 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8263 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8264 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8265 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8266 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8267 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8268 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8269 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8270 *puDst = uDst.u;
8271}
8272
8273
8274IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8275{
8276 RT_NOREF(pFpuState);
8277 RTUINT128U uSrc1 = *puDst;
8278 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8279 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8280 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8281 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8282 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8283 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8284 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8285 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8286 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8287 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8288 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8289 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8290 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8291 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8292 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8293 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8294}
8295
8296#endif
8297
8298IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8299 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8300{
8301 RT_NOREF(pExtState);
8302 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8303 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8304 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8305 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8306 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8307 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8308 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8309 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8310 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8311 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8312 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8313 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8314 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8315 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8316 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8317 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8318}
8319
8320IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8321 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8322{
8323 RT_NOREF(pExtState);
8324 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8325 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8326 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8327 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8328 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8329 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8330 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8331 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8332 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8333 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8334 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8335 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8336 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8337 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8338 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8339 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8340 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8341 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8342 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8343 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8344 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8345 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8346 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8347 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8348 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8349 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8350 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8351 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8352 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8353 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8354 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8355 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8356}
8357
8358
8359/*
8360 * PCMPGTW / VPCMPGTW
8361 */
8362#ifdef IEM_WITHOUT_ASSEMBLY
8363
8364IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8365{
8366 RT_NOREF(pFpuState);
8367 RTUINT64U uSrc1 = { *puDst };
8368 RTUINT64U uSrc2 = { *puSrc };
8369 RTUINT64U uDst;
8370 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8371 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8372 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8373 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8374 *puDst = uDst.u;
8375}
8376
8377
8378IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8379{
8380 RT_NOREF(pFpuState);
8381 RTUINT128U uSrc1 = *puDst;
8382 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8383 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8384 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8385 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8386 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8387 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8388 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8389 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8390}
8391
8392#endif
8393
8394IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8395 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8396{
8397 RT_NOREF(pExtState);
8398 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8399 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8400 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8401 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8402 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8403 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8404 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8405 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8406}
8407
8408IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8409 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8410{
8411 RT_NOREF(pExtState);
8412 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8413 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8414 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8415 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8416 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8417 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8418 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8419 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8420 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8421 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8422 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8423 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8424 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8425 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8426 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8427 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8428}
8429
8430
8431/*
8432 * PCMPGTD / VPCMPGTD.
8433 */
8434#ifdef IEM_WITHOUT_ASSEMBLY
8435
8436IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8437{
8438 RT_NOREF(pFpuState);
8439 RTUINT64U uSrc1 = { *puDst };
8440 RTUINT64U uSrc2 = { *puSrc };
8441 RTUINT64U uDst;
8442 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8443 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8444 *puDst = uDst.u;
8445}
8446
8447
8448IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8449{
8450 RT_NOREF(pFpuState);
8451 RTUINT128U uSrc1 = *puDst;
8452 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8453 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8454 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8455 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8456}
8457
8458#endif /* IEM_WITHOUT_ASSEMBLY */
8459
8460IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8461 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8462{
8463 RT_NOREF(pExtState);
8464 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8465 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8466 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8467 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8468}
8469
8470IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8471 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8472{
8473 RT_NOREF(pExtState);
8474 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8475 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8476 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8477 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8478 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8479 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8480 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8481 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8482}
8483
8484
8485/*
8486 * PCMPGTQ / VPCMPGTQ.
8487 */
8488IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8489{
8490 RT_NOREF(pFpuState);
8491 RTUINT128U uSrc1 = *puDst;
8492 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8493 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8494}
8495
8496IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8497 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8498{
8499 RT_NOREF(pExtState);
8500 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8501 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8502}
8503
8504IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8505 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8506{
8507 RT_NOREF(pExtState);
8508 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8509 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8510 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8511 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8512}
8513
8514
8515/*
8516 * PADDB / VPADDB
8517 */
8518#ifdef IEM_WITHOUT_ASSEMBLY
8519
8520IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8521{
8522 RT_NOREF(pFpuState);
8523 RTUINT64U uSrc1 = { *puDst };
8524 RTUINT64U uSrc2 = { *puSrc };
8525 RTUINT64U uDst;
8526 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8527 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8528 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8529 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8530 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8531 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8532 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8533 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8534 *puDst = uDst.u;
8535}
8536
8537
8538IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8539{
8540 RT_NOREF(pFpuState);
8541 RTUINT128U uSrc1 = *puDst;
8542 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8543 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8544 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8545 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8546 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8547 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8548 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8549 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8550 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8551 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8552 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8553 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8554 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8555 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8556 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8557 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8558}
8559
8560#endif
8561
8562
8563IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8564 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8565{
8566 RT_NOREF(pExtState);
8567 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8568 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8569 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8570 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8571 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8572 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8573 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8574 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8575 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8576 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8577 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8578 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8579 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8580 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8581 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8582 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8583}
8584
8585IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8586 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8587{
8588 RT_NOREF(pExtState);
8589 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8590 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8591 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8592 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8593 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8594 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8595 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8596 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8597 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8598 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8599 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8600 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8601 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8602 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8603 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8604 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8605 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8606 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8607 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8608 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8609 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8610 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8611 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8612 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8613 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8614 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8615 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8616 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8617 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8618 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8619 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8620 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8621}
8622
8623
8624/*
8625 * PADDSB / VPADDSB
8626 */
/**
 * Narrows a signed word-sized value to a byte with signed saturation.
 *
 * Values in the range -128..127 pass through unchanged (as their 8-bit
 * pattern).  Anything else yields 0x7f (INT8_MAX) when bit 15 of the input is
 * clear and 0x80 (INT8_MIN) when bit 15, the sign of the word, is set.
 *
 * @note    The argument is expanded more than once, so it must not have side
 *          effects.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) \
      : (uint8_t)(a_iWord) )
8631
8632#ifdef IEM_WITHOUT_ASSEMBLY
8633
8634IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8635{
8636 RT_NOREF(pFpuState);
8637 RTUINT64U uSrc1 = { *puDst };
8638 RTUINT64U uSrc2 = { *puSrc };
8639 RTUINT64U uDst;
8640 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8641 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8642 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8643 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8644 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8645 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8646 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8647 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8648 *puDst = uDst.u;
8649}
8650
8651
8652IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8653{
8654 RT_NOREF(pFpuState);
8655 RTUINT128U uSrc1 = *puDst;
8656 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8657 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8658 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8659 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8660 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8661 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8662 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8663 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8664 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8665 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8666 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8667 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8668 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8669 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8670 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8671 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8672}
8673
8674#endif
8675
8676IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u128_fallback,(PRTUINT128U puDst,
8677 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8678{
8679 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8680 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8681 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8682 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8683 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8684 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8685 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8686 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8687 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8688 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8689 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8690 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8691 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8692 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8693 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8694 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8695}
8696
8697IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u256_fallback,(PRTUINT256U puDst,
8698 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8699{
8700 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8701 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8702 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8703 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8704 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8705 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8706 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8707 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8708 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8709 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8710 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8711 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8712 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8713 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8714 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8715 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8716 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] + puSrc2->ai8[16]);
8717 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] + puSrc2->ai8[17]);
8718 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] + puSrc2->ai8[18]);
8719 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] + puSrc2->ai8[19]);
8720 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] + puSrc2->ai8[20]);
8721 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] + puSrc2->ai8[21]);
8722 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] + puSrc2->ai8[22]);
8723 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] + puSrc2->ai8[23]);
8724 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] + puSrc2->ai8[24]);
8725 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] + puSrc2->ai8[25]);
8726 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] + puSrc2->ai8[26]);
8727 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] + puSrc2->ai8[27]);
8728 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] + puSrc2->ai8[28]);
8729 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] + puSrc2->ai8[29]);
8730 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] + puSrc2->ai8[30]);
8731 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] + puSrc2->ai8[31]);
8732}
8733
8734
8735/*
8736 * PADDUSB / VPADDUSB
8737 */
/** Clamps an unsigned 16-bit intermediate to the uint8_t range: the argument
 *  is first reduced to uint16_t; anything above 0xff (UINT8_MAX) yields 0xff,
 *  everything else is passed through as uint8_t.
 *  @note The argument is expanded twice and must be free of side effects. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
8742
8743#ifdef IEM_WITHOUT_ASSEMBLY
8744
8745IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8746{
8747 RT_NOREF(pFpuState);
8748 RTUINT64U uSrc1 = { *puDst };
8749 RTUINT64U uSrc2 = { *puSrc };
8750 RTUINT64U uDst;
8751 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8752 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8753 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8754 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8755 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8756 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8757 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8758 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8759 *puDst = uDst.u;
8760}
8761
8762
8763IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8764{
8765 RT_NOREF(pFpuState);
8766 RTUINT128U uSrc1 = *puDst;
8767 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8768 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8769 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8770 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8771 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8772 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8773 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8774 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8775 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8776 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8777 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8778 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8779 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8780 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8781 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8782 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8783}
8784
8785#endif
8786
8787IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u128_fallback,(PRTUINT128U puDst,
8788 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8789{
8790 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8791 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8792 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8793 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8794 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8795 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8796 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8797 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8798 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8799 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8800 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8801 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8802 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8803 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8804 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8805 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8806}
8807
8808IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u256_fallback,(PRTUINT256U puDst,
8809 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8810{
8811 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8812 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8813 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8814 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8815 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8816 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8817 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8818 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8819 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8820 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8821 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8822 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8823 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8824 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8825 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8826 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8827 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[16] + puSrc2->au8[16]);
8828 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[17] + puSrc2->au8[17]);
8829 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[18] + puSrc2->au8[18]);
8830 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[19] + puSrc2->au8[19]);
8831 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[20] + puSrc2->au8[20]);
8832 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[21] + puSrc2->au8[21]);
8833 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[22] + puSrc2->au8[22]);
8834 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[23] + puSrc2->au8[23]);
8835 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[24] + puSrc2->au8[24]);
8836 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[25] + puSrc2->au8[25]);
8837 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[26] + puSrc2->au8[26]);
8838 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[27] + puSrc2->au8[27]);
8839 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[28] + puSrc2->au8[28]);
8840 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[29] + puSrc2->au8[29]);
8841 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[30] + puSrc2->au8[30]);
8842 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[31] + puSrc2->au8[31]);
8843}
8844
8845
8846/*
8847 * PADDW / VPADDW
8848 */
8849#ifdef IEM_WITHOUT_ASSEMBLY
8850
8851IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8852{
8853 RT_NOREF(pFpuState);
8854 RTUINT64U uSrc1 = { *puDst };
8855 RTUINT64U uSrc2 = { *puSrc };
8856 RTUINT64U uDst;
8857 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8858 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8859 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8860 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8861 *puDst = uDst.u;
8862}
8863
8864
8865IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8866{
8867 RT_NOREF(pFpuState);
8868 RTUINT128U uSrc1 = *puDst;
8869 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8870 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8871 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8872 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8873 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8874 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8875 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8876 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8877}
8878
8879#endif
8880
8881
8882IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8883 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8884{
8885 RT_NOREF(pExtState);
8886 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8887 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8888 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8889 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8890 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8891 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8892 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8893 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8894}
8895
8896IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8897 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8898{
8899 RT_NOREF(pExtState);
8900 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8901 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8902 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8903 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8904 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8905 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8906 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8907 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8908 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8909 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8910 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8911 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8912 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8913 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8914 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8915 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8916}
8917
8918
8919/*
8920 * PADDSW / VPADDSW
8921 */
/** Clamps a signed 32-bit intermediate to the int16_t range and yields the
 *  result as a 16-bit pattern.  Biasing by 0x8000 maps the in-range values
 *  onto 0..0xffff; anything outside saturates to 0x7fff (INT16_MAX) plus the
 *  source sign bit (bit 31), i.e. 0x8000 (INT16_MIN) for negative inputs.
 *  @note The argument is expanded several times and must be free of side effects. */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) )
8926
8927#ifdef IEM_WITHOUT_ASSEMBLY
8928
8929IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8930{
8931 RT_NOREF(pFpuState);
8932 RTUINT64U uSrc1 = { *puDst };
8933 RTUINT64U uSrc2 = { *puSrc };
8934 RTUINT64U uDst;
8935 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8936 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8937 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8938 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8939 *puDst = uDst.u;
8940}
8941
8942
8943IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8944{
8945 RT_NOREF(pFpuState);
8946 RTUINT128U uSrc1 = *puDst;
8947 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8948 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8949 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8950 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8951 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8952 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8953 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8954 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8955}
8956
8957#endif
8958
8959IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u128_fallback,(PRTUINT128U puDst,
8960 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8961{
8962 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8963 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8964 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8965 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8966 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8967 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8968 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8969 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8970}
8971
8972IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u256_fallback,(PRTUINT256U puDst,
8973 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8974{
8975 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8976 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8977 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8978 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8979 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8980 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8981 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8982 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8983 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] + puSrc2->ai16[8]);
8984 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] + puSrc2->ai16[9]);
8985 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc2->ai16[10]);
8986 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] + puSrc2->ai16[11]);
8987 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc2->ai16[12]);
8988 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] + puSrc2->ai16[13]);
8989 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc2->ai16[14]);
8990 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] + puSrc2->ai16[15]);
8991}
8992
8993
8994/*
8995 * PADDUSW / VPADDUSW
8996 */
/** Clamps an unsigned 32-bit intermediate to the uint16_t range: anything
 *  above 0xffff (UINT16_MAX) yields 0xffff, everything else is passed through
 *  as uint16_t.
 *  @note The argument is expanded twice and must be free of side effects. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
9001
9002#ifdef IEM_WITHOUT_ASSEMBLY
9003
9004IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9005{
9006 RT_NOREF(pFpuState);
9007 RTUINT64U uSrc1 = { *puDst };
9008 RTUINT64U uSrc2 = { *puSrc };
9009 RTUINT64U uDst;
9010 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
9011 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
9012 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
9013 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
9014 *puDst = uDst.u;
9015}
9016
9017
9018IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9019{
9020 RT_NOREF(pFpuState);
9021 RTUINT128U uSrc1 = *puDst;
9022 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
9023 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
9024 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
9025 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
9026 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
9027 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
9028 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
9029 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
9030}
9031
9032#endif
9033
9034IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u128_fallback,(PRTUINT128U puDst,
9035 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9036{
9037 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
9038 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
9039 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
9040 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
9041 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
9042 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
9043 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
9044 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
9045}
9046
9047IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u256_fallback,(PRTUINT256U puDst,
9048 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9049{
9050 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
9051 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
9052 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
9053 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
9054 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
9055 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
9056 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
9057 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
9058 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[8] + puSrc2->au16[8]);
9059 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[9] + puSrc2->au16[9]);
9060 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[10] + puSrc2->au16[10]);
9061 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[11] + puSrc2->au16[11]);
9062 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[12] + puSrc2->au16[12]);
9063 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[13] + puSrc2->au16[13]);
9064 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[14] + puSrc2->au16[14]);
9065 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[15] + puSrc2->au16[15]);
9066}
9067
9068
9069/*
9070 * PADDD / VPADDD.
9071 */
9072#ifdef IEM_WITHOUT_ASSEMBLY
9073
9074IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9075{
9076 RT_NOREF(pFpuState);
9077 RTUINT64U uSrc1 = { *puDst };
9078 RTUINT64U uSrc2 = { *puSrc };
9079 RTUINT64U uDst;
9080 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
9081 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
9082 *puDst = uDst.u;
9083}
9084
9085
9086IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9087{
9088 RT_NOREF(pFpuState);
9089 RTUINT128U uSrc1 = *puDst;
9090 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
9091 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
9092 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
9093 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
9094}
9095
9096#endif /* IEM_WITHOUT_ASSEMBLY */
9097
9098IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9099 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9100{
9101 RT_NOREF(pExtState);
9102 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9103 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9104 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9105 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9106}
9107
9108IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9109 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9110{
9111 RT_NOREF(pExtState);
9112 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9113 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9114 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9115 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9116 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
9117 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
9118 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
9119 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
9120}
9121
9122
9123/*
9124 * PADDQ / VPADDQ.
9125 */
9126#ifdef IEM_WITHOUT_ASSEMBLY
9127
9128IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9129{
9130 RT_NOREF(pFpuState);
9131 *puDst = *puDst + *puSrc;
9132}
9133
9134IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9135{
9136 RT_NOREF(pFpuState);
9137 RTUINT128U uSrc1 = *puDst;
9138 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
9139 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
9140}
9141
9142#endif
9143
9144IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9145 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9146{
9147 RT_NOREF(pExtState);
9148 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9149 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9150}
9151
9152IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9153 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9154{
9155 RT_NOREF(pExtState);
9156 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9157 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9158 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
9159 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
9160}
9161
9162
9163/*
9164 * PSUBB / VPSUBB
9165 */
9166#ifdef IEM_WITHOUT_ASSEMBLY
9167
9168IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9169{
9170 RT_NOREF(pFpuState);
9171 RTUINT64U uSrc1 = { *puDst };
9172 RTUINT64U uSrc2 = { *puSrc };
9173 RTUINT64U uDst;
9174 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
9175 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
9176 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
9177 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
9178 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
9179 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
9180 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
9181 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
9182 *puDst = uDst.u;
9183}
9184
9185
9186IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9187{
9188 RT_NOREF(pFpuState);
9189 RTUINT128U uSrc1 = *puDst;
9190 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
9191 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
9192 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
9193 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
9194 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
9195 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
9196 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
9197 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
9198 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
9199 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
9200 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
9201 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
9202 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
9203 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
9204 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
9205 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
9206}
9207
9208#endif
9209
9210IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9211 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9212{
9213 RT_NOREF(pExtState);
9214 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9215 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9216 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9217 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9218 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9219 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9220 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9221 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9222 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9223 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9224 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9225 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9226 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9227 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9228 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9229 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9230}
9231
9232IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9233 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9234{
9235 RT_NOREF(pExtState);
9236 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9237 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9238 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9239 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9240 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9241 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9242 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9243 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9244 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9245 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9246 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9247 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9248 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9249 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9250 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9251 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9252 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
9253 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
9254 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
9255 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
9256 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
9257 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
9258 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
9259 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
9260 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
9261 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
9262 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
9263 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
9264 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
9265 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
9266 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
9267 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
9268}
9269
9270
9271/*
9272 * PSUBSB / VPSUBSB
9273 */
9274#ifdef IEM_WITHOUT_ASSEMBLY
9275
9276IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9277{
9278 RT_NOREF(pFpuState);
9279 RTUINT64U uSrc1 = { *puDst };
9280 RTUINT64U uSrc2 = { *puSrc };
9281 RTUINT64U uDst;
9282 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
9283 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
9284 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
9285 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
9286 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
9287 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
9288 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
9289 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
9290 *puDst = uDst.u;
9291}
9292
9293
9294IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9295{
9296 RT_NOREF(pFpuState);
9297 RTUINT128U uSrc1 = *puDst;
9298 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
9299 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
9300 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
9301 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
9302 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
9303 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
9304 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
9305 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
9306 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
9307 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
9308 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
9309 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
9310 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
9311 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
9312 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
9313 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
9314}
9315
9316#endif
9317
9318IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u128_fallback,(PRTUINT128U puDst,
9319 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9320{
9321 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9322 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9323 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9324 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9325 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9326 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9327 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9328 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9329 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9330 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9331 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9332 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9333 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9334 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9335 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9336 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9337}
9338
9339IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u256_fallback,(PRTUINT256U puDst,
9340 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9341{
9342 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9343 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9344 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9345 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9346 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9347 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9348 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9349 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9350 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9351 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9352 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9353 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9354 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9355 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9356 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9357 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9358 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] - puSrc2->ai8[16]);
9359 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] - puSrc2->ai8[17]);
9360 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] - puSrc2->ai8[18]);
9361 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] - puSrc2->ai8[19]);
9362 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] - puSrc2->ai8[20]);
9363 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] - puSrc2->ai8[21]);
9364 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] - puSrc2->ai8[22]);
9365 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] - puSrc2->ai8[23]);
9366 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] - puSrc2->ai8[24]);
9367 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] - puSrc2->ai8[25]);
9368 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] - puSrc2->ai8[26]);
9369 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] - puSrc2->ai8[27]);
9370 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] - puSrc2->ai8[28]);
9371 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] - puSrc2->ai8[29]);
9372 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] - puSrc2->ai8[30]);
9373 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] - puSrc2->ai8[31]);
9374}
9375
9376
9377/*
9378 * PSUBUSB / VPSUBUSB
9379 */
/*
 * Clamp helper for unsigned byte subtraction.
 *
 * The argument is viewed as a uint16_t: anything above 0xff yields 0,
 * everything else is passed through as a uint8_t.  When the argument is the
 * int difference of two bytes, a negative difference lands above 0xff after
 * the uint16_t cast and is therefore clamped to zero.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
9384
9385#ifdef IEM_WITHOUT_ASSEMBLY
9386
9387IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9388{
9389 RT_NOREF(pFpuState);
9390 RTUINT64U uSrc1 = { *puDst };
9391 RTUINT64U uSrc2 = { *puSrc };
9392 RTUINT64U uDst;
9393 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9394 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9395 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9396 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9397 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9398 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9399 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9400 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9401 *puDst = uDst.u;
9402}
9403
9404
9405IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9406{
9407 RT_NOREF(pFpuState);
9408 RTUINT128U uSrc1 = *puDst;
9409 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9410 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9411 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9412 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9413 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9414 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9415 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9416 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9417 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9418 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9419 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9420 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9421 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9422 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9423 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9424 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9425}
9426
9427#endif
9428
9429IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u128_fallback,(PRTUINT128U puDst,
9430 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9431{
9432 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9433 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9434 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9435 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9436 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9437 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9438 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9439 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9440 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9441 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9442 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9443 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9444 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9445 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9446 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9447 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9448}
9449
9450IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u256_fallback,(PRTUINT256U puDst,
9451 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9452{
9453 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9454 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9455 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9456 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9457 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9458 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9459 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9460 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9461 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9462 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9463 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9464 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9465 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9466 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9467 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9468 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9469 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[16] - puSrc2->au8[16]);
9470 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[17] - puSrc2->au8[17]);
9471 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[18] - puSrc2->au8[18]);
9472 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[19] - puSrc2->au8[19]);
9473 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[20] - puSrc2->au8[20]);
9474 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[21] - puSrc2->au8[21]);
9475 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[22] - puSrc2->au8[22]);
9476 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[23] - puSrc2->au8[23]);
9477 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[24] - puSrc2->au8[24]);
9478 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[25] - puSrc2->au8[25]);
9479 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[26] - puSrc2->au8[26]);
9480 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[27] - puSrc2->au8[27]);
9481 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[28] - puSrc2->au8[28]);
9482 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[29] - puSrc2->au8[29]);
9483 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[30] - puSrc2->au8[30]);
9484 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[31] - puSrc2->au8[31]);
9485}
9486
9487
9488/*
9489 * PSUBW / VPSUBW
9490 */
9491#ifdef IEM_WITHOUT_ASSEMBLY
9492
9493IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9494{
9495 RT_NOREF(pFpuState);
9496 RTUINT64U uSrc1 = { *puDst };
9497 RTUINT64U uSrc2 = { *puSrc };
9498 RTUINT64U uDst;
9499 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9500 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9501 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9502 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9503 *puDst = uDst.u;
9504}
9505
9506
9507IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9508{
9509 RT_NOREF(pFpuState);
9510 RTUINT128U uSrc1 = *puDst;
9511 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9512 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9513 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9514 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9515 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9516 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9517 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9518 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9519}
9520
9521#endif
9522
9523IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9524 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9525{
9526 RT_NOREF(pExtState);
9527 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9528 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9529 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9530 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9531 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9532 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9533 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9534 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9535}
9536
9537IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9538 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9539{
9540 RT_NOREF(pExtState);
9541 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9542 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9543 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9544 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9545 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9546 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9547 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9548 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9549 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9550 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9551 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9552 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9553 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9554 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9555 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9556 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9557}
9558
9559
9560/*
9561 * PSUBSW / VPSUBSW
9562 */
9563#ifdef IEM_WITHOUT_ASSEMBLY
9564
9565IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9566{
9567 RT_NOREF(pFpuState);
9568 RTUINT64U uSrc1 = { *puDst };
9569 RTUINT64U uSrc2 = { *puSrc };
9570 RTUINT64U uDst;
9571 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9572 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9573 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9574 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9575 *puDst = uDst.u;
9576}
9577
9578
9579IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9580{
9581 RT_NOREF(pFpuState);
9582 RTUINT128U uSrc1 = *puDst;
9583 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9584 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9585 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9586 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9587 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9588 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9589 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9590 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9591}
9592
9593#endif
9594
9595IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u128_fallback,(PRTUINT128U puDst,
9596 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9597{
9598 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9599 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9600 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9601 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9602 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9603 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9604 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9605 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9606}
9607
9608IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u256_fallback,(PRTUINT256U puDst,
9609 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9610{
9611 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9612 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9613 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9614 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9615 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9616 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9617 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9618 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9619 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] - puSrc2->ai16[8]);
9620 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] - puSrc2->ai16[9]);
9621 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc2->ai16[10]);
9622 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] - puSrc2->ai16[11]);
9623 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc2->ai16[12]);
9624 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] - puSrc2->ai16[13]);
9625 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc2->ai16[14]);
9626 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] - puSrc2->ai16[15]);
9627}
9628
9629
9630/*
9631 * PSUBUSW / VPSUBUSW
9632 */
/*
 * Clamp helper for unsigned word subtraction.
 *
 * The argument is viewed as a uint32_t: anything above 0xffff yields 0,
 * everything else is passed through as a uint16_t.  When the argument is the
 * int difference of two words, a negative difference lands above 0xffff after
 * the uint32_t cast and is therefore clamped to zero.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9637
9638#ifdef IEM_WITHOUT_ASSEMBLY
9639
9640IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9641{
9642 RT_NOREF(pFpuState);
9643 RTUINT64U uSrc1 = { *puDst };
9644 RTUINT64U uSrc2 = { *puSrc };
9645 RTUINT64U uDst;
9646 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9647 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9648 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9649 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9650 *puDst = uDst.u;
9651}
9652
9653
9654IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9655{
9656 RT_NOREF(pFpuState);
9657 RTUINT128U uSrc1 = *puDst;
9658 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9659 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9660 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9661 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9662 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9663 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9664 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9665 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9666}
9667
9668#endif
9669
9670IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u128_fallback,(PRTUINT128U puDst,
9671 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9672{
9673 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9674 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9675 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9676 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9677 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9678 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9679 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9680 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9681}
9682
9683IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u256_fallback,(PRTUINT256U puDst,
9684 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9685{
9686 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9687 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9688 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9689 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9690 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9691 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9692 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9693 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9694 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[8] - puSrc2->au16[8]);
9695 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[9] - puSrc2->au16[9]);
9696 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[10] - puSrc2->au16[10]);
9697 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[11] - puSrc2->au16[11]);
9698 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[12] - puSrc2->au16[12]);
9699 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[13] - puSrc2->au16[13]);
9700 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[14] - puSrc2->au16[14]);
9701 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[15] - puSrc2->au16[15]);
9702}
9703
9704
9705
9706/*
9707 * PSUBD / VPSUBD.
9708 */
9709#ifdef IEM_WITHOUT_ASSEMBLY
9710
9711IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9712{
9713 RT_NOREF(pFpuState);
9714 RTUINT64U uSrc1 = { *puDst };
9715 RTUINT64U uSrc2 = { *puSrc };
9716 RTUINT64U uDst;
9717 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9718 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9719 *puDst = uDst.u;
9720}
9721
9722
9723IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9724{
9725 RT_NOREF(pFpuState);
9726 RTUINT128U uSrc1 = *puDst;
9727 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9728 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9729 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9730 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9731}
9732
9733#endif /* IEM_WITHOUT_ASSEMBLY */
9734
9735IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9736 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9737{
9738 RT_NOREF(pExtState);
9739 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9740 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9741 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9742 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9743}
9744
9745IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9746 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9747{
9748 RT_NOREF(pExtState);
9749 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9750 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9751 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9752 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9753 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9754 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9755 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9756 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9757}
9758
9759
9760/*
9761 * PSUBQ / VPSUBQ.
9762 */
9763#ifdef IEM_WITHOUT_ASSEMBLY
9764
9765IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9766{
9767 RT_NOREF(pFpuState);
9768 *puDst = *puDst - *puSrc;
9769}
9770
9771IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9772{
9773 RT_NOREF(pFpuState);
9774 RTUINT128U uSrc1 = *puDst;
9775 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9776 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9777}
9778
9779#endif
9780
9781IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9782 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9783{
9784 RT_NOREF(pExtState);
9785 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9786 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9787}
9788
9789IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9790 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9791{
9792 RT_NOREF(pExtState);
9793 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9794 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9795 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9796 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9797}
9798
9799
9800
9801/*
9802 * PMULLW / VPMULLW / PMULLD / VPMULLD
9803 */
9804#ifdef IEM_WITHOUT_ASSEMBLY
9805
9806IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9807{
9808 RT_NOREF(pFpuState);
9809 RTUINT64U uSrc1 = { *puDst };
9810 RTUINT64U uSrc2 = { *puSrc };
9811 RTUINT64U uDst;
9812 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9813 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9814 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9815 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9816 *puDst = uDst.u;
9817}
9818
9819
9820IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9821{
9822 RT_NOREF(pFpuState);
9823 RTUINT128U uSrc1 = *puDst;
9824 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9825 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9826 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9827 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9828 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9829 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9830 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9831 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9832}
9833
9834#endif
9835
9836IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9837{
9838 RTUINT128U uSrc1 = *puDst;
9839
9840 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9841 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9842 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9843 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9844 RT_NOREF(pFpuState);
9845}
9846
9847
9848IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9849{
9850 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9851 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9852 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9853 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9854 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9855 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9856 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9857 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9858}
9859
9860
9861IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9862{
9863 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9864 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9865 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9866 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9867 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9868 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9869 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9870 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9871 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9872 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9873 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9874 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9875 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9876 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9877 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9878 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9879}
9880
9881
9882IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9883{
9884 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9885 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9886 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9887 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9888}
9889
9890
9891IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9892{
9893 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9894 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9895 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9896 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9897 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9898 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9899 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9900 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9901}
9902
9903
9904/*
9905 * PMULHW / VPMULHW
9906 */
9907#ifdef IEM_WITHOUT_ASSEMBLY
9908
9909IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9910{
9911 RT_NOREF(pFpuState);
9912 RTUINT64U uSrc1 = { *puDst };
9913 RTUINT64U uSrc2 = { *puSrc };
9914 RTUINT64U uDst;
9915 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9916 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9917 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9918 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9919 *puDst = uDst.u;
9920}
9921
9922
9923IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9924{
9925 RT_NOREF(pFpuState);
9926 RTUINT128U uSrc1 = *puDst;
9927 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9928 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9929 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9930 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9931 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9932 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9933 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9934 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9935}
9936
9937#endif
9938
9939IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9940{
9941 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9942 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9943 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9944 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9945 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9946 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9947 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9948 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9949}
9950
9951
9952IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9953{
9954 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9955 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9956 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9957 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9958 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9959 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9960 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9961 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9962 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9963 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9964 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9965 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9966 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9967 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9968 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9969 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9970}
9971
9972
9973/*
9974 * PMULHUW / VPMULHUW
9975 */
9976#ifdef IEM_WITHOUT_ASSEMBLY
9977
9978IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9979{
9980 RTUINT64U uSrc1 = { *puDst };
9981 RTUINT64U uSrc2 = { *puSrc };
9982 RTUINT64U uDst;
9983 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9984 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9985 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9986 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9987 *puDst = uDst.u;
9988}
9989
9990
9991IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9992{
9993 RTUINT128U uSrc1 = *puDst;
9994 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9995 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9996 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9997 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9998 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9999 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
10000 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
10001 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
10002}
10003
10004#endif
10005
10006IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10007{
10008 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
10009 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
10010 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
10011 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
10012 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
10013 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
10014 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
10015 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
10016}
10017
10018
10019IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10020{
10021 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
10022 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
10023 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
10024 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
10025 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
10026 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
10027 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
10028 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
10029 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
10030 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
10031 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
10032 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
10033 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
10034 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
10035 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
10036 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
10037}
10038
10039
10040/*
10041 * PSRLW / VPSRLW
10042 */
10043#ifdef IEM_WITHOUT_ASSEMBLY
10044
10045IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10046{
10047 RTUINT64U uSrc1 = { *puDst };
10048 RTUINT64U uSrc2 = { *puSrc };
10049 RTUINT64U uDst;
10050
10051 if (uSrc2.au64[0] <= 15)
10052 {
10053 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
10054 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
10055 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
10056 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
10057 }
10058 else
10059 {
10060 uDst.au64[0] = 0;
10061 }
10062 *puDst = uDst.u;
10063}
10064
10065
10066IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10067{
10068 RTUINT64U uSrc1 = { *puDst };
10069 RTUINT64U uDst;
10070
10071 if (uShift <= 15)
10072 {
10073 uDst.au16[0] = uSrc1.au16[0] >> uShift;
10074 uDst.au16[1] = uSrc1.au16[1] >> uShift;
10075 uDst.au16[2] = uSrc1.au16[2] >> uShift;
10076 uDst.au16[3] = uSrc1.au16[3] >> uShift;
10077 }
10078 else
10079 {
10080 uDst.au64[0] = 0;
10081 }
10082 *puDst = uDst.u;
10083}
10084
10085
10086IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10087{
10088 RTUINT128U uSrc1 = *puDst;
10089
10090 if (puSrc->au64[0] <= 15)
10091 {
10092 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
10093 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
10094 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
10095 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
10096 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
10097 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
10098 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
10099 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
10100 }
10101 else
10102 {
10103 puDst->au64[0] = 0;
10104 puDst->au64[1] = 0;
10105 }
10106}
10107
10108IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10109{
10110 RTUINT128U uSrc1 = *puDst;
10111
10112 if (uShift <= 15)
10113 {
10114 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10115 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10116 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10117 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10118 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10119 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10120 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10121 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10122 }
10123 else
10124 {
10125 puDst->au64[0] = 0;
10126 puDst->au64[1] = 0;
10127 }
10128}
10129
10130#endif
10131
10132IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10133{
10134 RTUINT128U uSrc1 = *puSrc1;
10135
10136 if (uShift <= 15)
10137 {
10138 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10139 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10140 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10141 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10142 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10143 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10144 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10145 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10146 }
10147 else
10148 {
10149 puDst->au64[0] = 0;
10150 puDst->au64[1] = 0;
10151 }
10152}
10153
10154IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10155{
10156 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10157}
10158
10159IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10160{
10161 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, uShift);
10162}
10163
10164IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10165{
10166 RTUINT256U uSrc1 = *puSrc1;
10167
10168 if (uShift <= 15)
10169 {
10170 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10171 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10172 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10173 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10174 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10175 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10176 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10177 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10178 puDst->au16[8] = uSrc1.au16[8] >> uShift;
10179 puDst->au16[9] = uSrc1.au16[9] >> uShift;
10180 puDst->au16[10] = uSrc1.au16[10] >> uShift;
10181 puDst->au16[11] = uSrc1.au16[11] >> uShift;
10182 puDst->au16[12] = uSrc1.au16[12] >> uShift;
10183 puDst->au16[13] = uSrc1.au16[13] >> uShift;
10184 puDst->au16[14] = uSrc1.au16[14] >> uShift;
10185 puDst->au16[15] = uSrc1.au16[15] >> uShift;
10186 }
10187 else
10188 {
10189 puDst->au64[0] = 0;
10190 puDst->au64[1] = 0;
10191 puDst->au64[2] = 0;
10192 puDst->au64[3] = 0;
10193 }
10194}
10195
10196IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10197{
10198 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, uShift);
10199}
10200
10201IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10202{
10203 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10204}
10205
10206
10207/*
10208 * PSRAW / VPSRAW
10209 */
10210#ifdef IEM_WITHOUT_ASSEMBLY
10211
10212IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10213{
10214 RTUINT64U uSrc1 = { *puDst };
10215 RTUINT64U uSrc2 = { *puSrc };
10216 RTUINT64U uDst;
10217 uint8_t uShift;
10218
10219 uShift = RT_MIN(15, uSrc2.au64[0]);
10220
10221 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10222 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10223 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10224 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10225
10226 *puDst = uDst.u;
10227}
10228
10229
10230IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10231{
10232 RTUINT64U uSrc1 = { *puDst };
10233 RTUINT64U uDst;
10234
10235 uShift = RT_MIN(15, uShift);
10236
10237 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10238 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10239 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10240 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10241
10242 *puDst = uDst.u;
10243}
10244
10245
10246IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10247{
10248 RTUINT128U uSrc1 = *puDst;
10249 uint8_t uShift;
10250
10251 uShift = RT_MIN(15, puSrc->au64[0]);
10252
10253 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10254 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10255 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10256 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10257 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10258 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10259 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10260 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10261}
10262
10263IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10264{
10265 RTUINT128U uSrc1 = *puDst;
10266
10267 uShift = RT_MIN(15, uShift);
10268
10269 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10270 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10271 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10272 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10273 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10274 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10275 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10276 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10277}
10278
10279#endif
10280
10281IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10282{
10283 RTUINT128U uSrc1 = *puSrc1;
10284
10285 uShift = RT_MIN(15, uShift);
10286
10287 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10288 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10289 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10290 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10291 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10292 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10293 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10294 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10295}
10296
10297IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10298{
10299 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10300}
10301
10302IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10303{
10304 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, uShift);
10305}
10306
10307IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10308{
10309 RTUINT256U uSrc1 = *puSrc1;
10310
10311 uShift = RT_MIN(15, uShift);
10312
10313 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10314 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10315 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10316 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10317 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10318 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10319 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10320 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10321 puDst->ai16[8] = uSrc1.ai16[8] >> uShift;
10322 puDst->ai16[9] = uSrc1.ai16[9] >> uShift;
10323 puDst->ai16[10] = uSrc1.ai16[10] >> uShift;
10324 puDst->ai16[11] = uSrc1.ai16[11] >> uShift;
10325 puDst->ai16[12] = uSrc1.ai16[12] >> uShift;
10326 puDst->ai16[13] = uSrc1.ai16[13] >> uShift;
10327 puDst->ai16[14] = uSrc1.ai16[14] >> uShift;
10328 puDst->ai16[15] = uSrc1.ai16[15] >> uShift;
10329}
10330
10331IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10332{
10333 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, uShift);
10334}
10335
10336IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10337{
10338 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10339}
10340
10341
10342/*
10343 * PSLLW / VPSLLW
10344 */
10345#ifdef IEM_WITHOUT_ASSEMBLY
10346
10347IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10348{
10349 RTUINT64U uSrc1 = { *puDst };
10350 RTUINT64U uSrc2 = { *puSrc };
10351 RTUINT64U uDst;
10352
10353 if (uSrc2.au64[0] <= 15)
10354 {
10355 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
10356 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
10357 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
10358 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
10359 }
10360 else
10361 {
10362 uDst.au64[0] = 0;
10363 }
10364 *puDst = uDst.u;
10365}
10366
10367
10368IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10369{
10370 RTUINT64U uSrc1 = { *puDst };
10371 RTUINT64U uDst;
10372
10373 if (uShift <= 15)
10374 {
10375 uDst.au16[0] = uSrc1.au16[0] << uShift;
10376 uDst.au16[1] = uSrc1.au16[1] << uShift;
10377 uDst.au16[2] = uSrc1.au16[2] << uShift;
10378 uDst.au16[3] = uSrc1.au16[3] << uShift;
10379 }
10380 else
10381 {
10382 uDst.au64[0] = 0;
10383 }
10384 *puDst = uDst.u;
10385}
10386
10387
10388IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10389{
10390 RTUINT128U uSrc1 = *puDst;
10391
10392 if (puSrc->au64[0] <= 15)
10393 {
10394 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
10395 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
10396 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
10397 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
10398 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
10399 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
10400 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
10401 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
10402 }
10403 else
10404 {
10405 puDst->au64[0] = 0;
10406 puDst->au64[1] = 0;
10407 }
10408}
10409
10410IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10411{
10412 RTUINT128U uSrc1 = *puDst;
10413
10414 if (uShift <= 15)
10415 {
10416 puDst->au16[0] = uSrc1.au16[0] << uShift;
10417 puDst->au16[1] = uSrc1.au16[1] << uShift;
10418 puDst->au16[2] = uSrc1.au16[2] << uShift;
10419 puDst->au16[3] = uSrc1.au16[3] << uShift;
10420 puDst->au16[4] = uSrc1.au16[4] << uShift;
10421 puDst->au16[5] = uSrc1.au16[5] << uShift;
10422 puDst->au16[6] = uSrc1.au16[6] << uShift;
10423 puDst->au16[7] = uSrc1.au16[7] << uShift;
10424 }
10425 else
10426 {
10427 puDst->au64[0] = 0;
10428 puDst->au64[1] = 0;
10429 }
10430}
10431
10432#endif
10433
10434IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10435{
10436 RTUINT128U uSrc1 = *puSrc1;
10437
10438 if (uShift <= 15)
10439 {
10440 puDst->au16[0] = uSrc1.au16[0] << uShift;
10441 puDst->au16[1] = uSrc1.au16[1] << uShift;
10442 puDst->au16[2] = uSrc1.au16[2] << uShift;
10443 puDst->au16[3] = uSrc1.au16[3] << uShift;
10444 puDst->au16[4] = uSrc1.au16[4] << uShift;
10445 puDst->au16[5] = uSrc1.au16[5] << uShift;
10446 puDst->au16[6] = uSrc1.au16[6] << uShift;
10447 puDst->au16[7] = uSrc1.au16[7] << uShift;
10448 }
10449 else
10450 {
10451 puDst->au64[0] = 0;
10452 puDst->au64[1] = 0;
10453 }
10454}
10455
10456IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10457{
10458 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10459}
10460
10461IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10462{
10463 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, uShift);
10464}
10465
10466IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10467{
10468 RTUINT256U uSrc1 = *puSrc1;
10469
10470 if (uShift <= 15)
10471 {
10472 puDst->au16[0] = uSrc1.au16[0] << uShift;
10473 puDst->au16[1] = uSrc1.au16[1] << uShift;
10474 puDst->au16[2] = uSrc1.au16[2] << uShift;
10475 puDst->au16[3] = uSrc1.au16[3] << uShift;
10476 puDst->au16[4] = uSrc1.au16[4] << uShift;
10477 puDst->au16[5] = uSrc1.au16[5] << uShift;
10478 puDst->au16[6] = uSrc1.au16[6] << uShift;
10479 puDst->au16[7] = uSrc1.au16[7] << uShift;
10480 puDst->au16[8] = uSrc1.au16[8] << uShift;
10481 puDst->au16[9] = uSrc1.au16[9] << uShift;
10482 puDst->au16[10] = uSrc1.au16[10] << uShift;
10483 puDst->au16[11] = uSrc1.au16[11] << uShift;
10484 puDst->au16[12] = uSrc1.au16[12] << uShift;
10485 puDst->au16[13] = uSrc1.au16[13] << uShift;
10486 puDst->au16[14] = uSrc1.au16[14] << uShift;
10487 puDst->au16[15] = uSrc1.au16[15] << uShift;
10488 }
10489 else
10490 {
10491 puDst->au64[0] = 0;
10492 puDst->au64[1] = 0;
10493 puDst->au64[2] = 0;
10494 puDst->au64[3] = 0;
10495 }
10496}
10497
10498IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10499{
10500 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10501}
10502
10503IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10504{
10505 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, uShift);
10506}
10507
10508/*
10509 * PSRLD / VPSRLD
10510 */
10511#ifdef IEM_WITHOUT_ASSEMBLY
10512
10513IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10514{
10515 RTUINT64U uSrc1 = { *puDst };
10516 RTUINT64U uSrc2 = { *puSrc };
10517 RTUINT64U uDst;
10518
10519 if (uSrc2.au64[0] <= 31)
10520 {
10521 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
10522 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
10523 }
10524 else
10525 {
10526 uDst.au64[0] = 0;
10527 }
10528 *puDst = uDst.u;
10529}
10530
10531
10532IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10533{
10534 RTUINT64U uSrc1 = { *puDst };
10535 RTUINT64U uDst;
10536
10537 if (uShift <= 31)
10538 {
10539 uDst.au32[0] = uSrc1.au32[0] >> uShift;
10540 uDst.au32[1] = uSrc1.au32[1] >> uShift;
10541 }
10542 else
10543 {
10544 uDst.au64[0] = 0;
10545 }
10546 *puDst = uDst.u;
10547}
10548
10549
10550IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10551{
10552 RTUINT128U uSrc1 = *puDst;
10553
10554 if (puSrc->au64[0] <= 31)
10555 {
10556 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
10557 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
10558 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
10559 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
10560 }
10561 else
10562 {
10563 puDst->au64[0] = 0;
10564 puDst->au64[1] = 0;
10565 }
10566}
10567
10568IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10569{
10570 RTUINT128U uSrc1 = *puDst;
10571
10572 if (uShift <= 31)
10573 {
10574 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10575 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10576 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10577 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10578 }
10579 else
10580 {
10581 puDst->au64[0] = 0;
10582 puDst->au64[1] = 0;
10583 }
10584}
10585
10586#endif
10587
10588IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10589{
10590 RTUINT128U uSrc1 = *puSrc1;
10591
10592 if (uShift <= 31)
10593 {
10594 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10595 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10596 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10597 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10598 }
10599 else
10600 {
10601 puDst->au64[0] = 0;
10602 puDst->au64[1] = 0;
10603 }
10604}
10605
10606IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10607{
10608 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, uShift);
10609}
10610
10611IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10612{
10613 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10614}
10615
10616IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10617{
10618 RTUINT256U uSrc1 = *puSrc1;
10619
10620 if (uShift <= 31)
10621 {
10622 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10623 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10624 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10625 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10626 puDst->au32[4] = uSrc1.au32[4] >> uShift;
10627 puDst->au32[5] = uSrc1.au32[5] >> uShift;
10628 puDst->au32[6] = uSrc1.au32[6] >> uShift;
10629 puDst->au32[7] = uSrc1.au32[7] >> uShift;
10630 }
10631 else
10632 {
10633 puDst->au64[0] = 0;
10634 puDst->au64[1] = 0;
10635 puDst->au64[2] = 0;
10636 puDst->au64[3] = 0;
10637 }
10638}
10639
10640IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10641{
10642 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10643}
10644
10645IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10646{
10647 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, uShift);
10648}
10649
10650
10651/*
10652 * PSRAD / VPSRAD
10653 */
10654#ifdef IEM_WITHOUT_ASSEMBLY
10655
10656IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
10657{
10658 RTUINT64U uSrc1 = { *puDst };
10659 RTUINT64U uSrc2 = { *puSrc };
10660 RTUINT64U uDst;
10661 uint8_t uShift;
10662
10663 uShift = RT_MIN(31, uSrc2.au64[0]);
10664
10665 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10666 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10667
10668 *puDst = uDst.u;
10669}
10670
10671
10672IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
10673{
10674 RTUINT64U uSrc1 = { *puDst };
10675 RTUINT64U uDst;
10676
10677 uShift = RT_MIN(31, uShift);
10678
10679 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10680 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10681
10682 *puDst = uDst.u;
10683}
10684
10685
10686IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10687{
10688 RTUINT128U uSrc1 = *puDst;
10689 uint8_t uShift;
10690
10691 uShift = RT_MIN(31, puSrc->au64[0]);
10692
10693 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10694 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10695 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10696 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10697}
10698
10699IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10700{
10701 RTUINT128U uSrc1 = *puDst;
10702
10703 uShift = RT_MIN(31, uShift);
10704
10705 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10706 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10707 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10708 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10709}
10710
10711#endif
10712
10713IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10714{
10715 RTUINT128U uSrc1 = *puSrc1;
10716
10717 uShift = RT_MIN(31, uShift);
10718
10719 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10720 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10721 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10722 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10723}
10724
10725IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10726{
10727 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, uShift);
10728}
10729
10730IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10731{
10732 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10733}
10734
10735IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10736{
10737 RTUINT256U uSrc1 = *puSrc1;
10738
10739 uShift = RT_MIN(31, uShift);
10740
10741 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10742 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10743 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10744 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10745 puDst->ai32[4] = uSrc1.ai32[4] >> uShift;
10746 puDst->ai32[5] = uSrc1.ai32[5] >> uShift;
10747 puDst->ai32[6] = uSrc1.ai32[6] >> uShift;
10748 puDst->ai32[7] = uSrc1.ai32[7] >> uShift;
10749}
10750
10751IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10752{
10753 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10754}
10755
10756IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10757{
10758 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, uShift);
10759}
10760
10761
10762/*
10763 * PSLLD / VPSLLD
10764 */
10765#ifdef IEM_WITHOUT_ASSEMBLY
10766
10767IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10768{
10769 RTUINT64U uSrc1 = { *puDst };
10770 RTUINT64U uSrc2 = { *puSrc };
10771 RTUINT64U uDst;
10772
10773 if (uSrc2.au64[0] <= 31)
10774 {
10775 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
10776 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
10777 }
10778 else
10779 {
10780 uDst.au64[0] = 0;
10781 }
10782 *puDst = uDst.u;
10783}
10784
10785
10786IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10787{
10788 RTUINT64U uSrc1 = { *puDst };
10789 RTUINT64U uDst;
10790
10791 if (uShift <= 31)
10792 {
10793 uDst.au32[0] = uSrc1.au32[0] << uShift;
10794 uDst.au32[1] = uSrc1.au32[1] << uShift;
10795 }
10796 else
10797 {
10798 uDst.au64[0] = 0;
10799 }
10800 *puDst = uDst.u;
10801}
10802
10803
10804IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10805{
10806 RTUINT128U uSrc1 = *puDst;
10807
10808 if (puSrc->au64[0] <= 31)
10809 {
10810 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10811 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10812 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10813 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10814 }
10815 else
10816 {
10817 puDst->au64[0] = 0;
10818 puDst->au64[1] = 0;
10819 }
10820}
10821
10822IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10823{
10824 RTUINT128U uSrc1 = *puDst;
10825
10826 if (uShift <= 31)
10827 {
10828 puDst->au32[0] = uSrc1.au32[0] << uShift;
10829 puDst->au32[1] = uSrc1.au32[1] << uShift;
10830 puDst->au32[2] = uSrc1.au32[2] << uShift;
10831 puDst->au32[3] = uSrc1.au32[3] << uShift;
10832 }
10833 else
10834 {
10835 puDst->au64[0] = 0;
10836 puDst->au64[1] = 0;
10837 }
10838}
10839
10840#endif
10841
10842IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10843{
10844 RTUINT128U uSrc1 = *puSrc1;
10845
10846 if (uShift <= 31)
10847 {
10848 puDst->au32[0] = uSrc1.au32[0] << uShift;
10849 puDst->au32[1] = uSrc1.au32[1] << uShift;
10850 puDst->au32[2] = uSrc1.au32[2] << uShift;
10851 puDst->au32[3] = uSrc1.au32[3] << uShift;
10852 }
10853 else
10854 {
10855 puDst->au64[0] = 0;
10856 puDst->au64[1] = 0;
10857 }
10858}
10859
10860IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10861{
10862 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, uShift);
10863}
10864
10865IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10866{
10867 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10868}
10869
10870IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10871{
10872 RTUINT256U uSrc1 = *puSrc1;
10873
10874 if (uShift <= 31)
10875 {
10876 puDst->au32[0] = uSrc1.au32[0] << uShift;
10877 puDst->au32[1] = uSrc1.au32[1] << uShift;
10878 puDst->au32[2] = uSrc1.au32[2] << uShift;
10879 puDst->au32[3] = uSrc1.au32[3] << uShift;
10880 puDst->au32[4] = uSrc1.au32[4] << uShift;
10881 puDst->au32[5] = uSrc1.au32[5] << uShift;
10882 puDst->au32[6] = uSrc1.au32[6] << uShift;
10883 puDst->au32[7] = uSrc1.au32[7] << uShift;
10884 }
10885 else
10886 {
10887 puDst->au64[0] = 0;
10888 puDst->au64[1] = 0;
10889 puDst->au64[2] = 0;
10890 puDst->au64[3] = 0;
10891 }
10892}
10893
10894IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10895{
10896 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10897}
10898
10899IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10900{
10901 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, uShift);
10902}
10903
10904
10905/*
10906 * PSRLQ / VPSRLQ
10907 */
10908#ifdef IEM_WITHOUT_ASSEMBLY
10909
10910IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10911{
10912 RTUINT64U uSrc1 = { *puDst };
10913 RTUINT64U uSrc2 = { *puSrc };
10914 RTUINT64U uDst;
10915
10916 if (uSrc2.au64[0] <= 63)
10917 {
10918 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10919 }
10920 else
10921 {
10922 uDst.au64[0] = 0;
10923 }
10924 *puDst = uDst.u;
10925}
10926
10927
10928IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10929{
10930 RTUINT64U uSrc1 = { *puDst };
10931 RTUINT64U uDst;
10932
10933 if (uShift <= 63)
10934 {
10935 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10936 }
10937 else
10938 {
10939 uDst.au64[0] = 0;
10940 }
10941 *puDst = uDst.u;
10942}
10943
10944
10945IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10946{
10947 RTUINT128U uSrc1 = *puDst;
10948
10949 if (puSrc->au64[0] <= 63)
10950 {
10951 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10952 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10953 }
10954 else
10955 {
10956 puDst->au64[0] = 0;
10957 puDst->au64[1] = 0;
10958 }
10959}
10960
10961IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10962{
10963 RTUINT128U uSrc1 = *puDst;
10964
10965 if (uShift <= 63)
10966 {
10967 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10968 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10969 }
10970 else
10971 {
10972 puDst->au64[0] = 0;
10973 puDst->au64[1] = 0;
10974 }
10975}
10976
10977#endif
10978
10979IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10980{
10981 RTUINT128U uSrc1 = *puSrc1;
10982
10983 if (uShift <= 63)
10984 {
10985 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10986 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10987 }
10988 else
10989 {
10990 puDst->au64[0] = 0;
10991 puDst->au64[1] = 0;
10992 }
10993}
10994
10995IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10996{
10997 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, uShift);
10998}
10999
11000IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11001{
11002 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11003}
11004
11005IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11006{
11007 RTUINT256U uSrc1 = *puSrc1;
11008
11009 if (uShift <= 63)
11010 {
11011 puDst->au64[0] = uSrc1.au64[0] >> uShift;
11012 puDst->au64[1] = uSrc1.au64[1] >> uShift;
11013 puDst->au64[2] = uSrc1.au64[2] >> uShift;
11014 puDst->au64[3] = uSrc1.au64[3] >> uShift;
11015 }
11016 else
11017 {
11018 puDst->au64[0] = 0;
11019 puDst->au64[1] = 0;
11020 puDst->au64[2] = 0;
11021 puDst->au64[3] = 0;
11022 }
11023}
11024
11025IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11026{
11027 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11028}
11029
11030IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11031{
11032 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, uShift);
11033}
11034
11035
11036/*
11037 * PSLLQ / VPSLLQ
11038 */
11039#ifdef IEM_WITHOUT_ASSEMBLY
11040
11041IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11042{
11043 RTUINT64U uSrc1 = { *puDst };
11044 RTUINT64U uSrc2 = { *puSrc };
11045 RTUINT64U uDst;
11046
11047 if (uSrc2.au64[0] <= 63)
11048 {
11049 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
11050 }
11051 else
11052 {
11053 uDst.au64[0] = 0;
11054 }
11055 *puDst = uDst.u;
11056}
11057
11058
11059IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
11060{
11061 RTUINT64U uSrc1 = { *puDst };
11062 RTUINT64U uDst;
11063
11064 if (uShift <= 63)
11065 {
11066 uDst.au64[0] = uSrc1.au64[0] << uShift;
11067 }
11068 else
11069 {
11070 uDst.au64[0] = 0;
11071 }
11072 *puDst = uDst.u;
11073}
11074
11075
11076IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11077{
11078 RTUINT128U uSrc1 = *puDst;
11079
11080 if (puSrc->au64[0] <= 63)
11081 {
11082 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
11083 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
11084 }
11085 else
11086 {
11087 puDst->au64[0] = 0;
11088 puDst->au64[1] = 0;
11089 }
11090}
11091
11092IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11093{
11094 RTUINT128U uSrc1 = *puDst;
11095
11096 if (uShift <= 63)
11097 {
11098 puDst->au64[0] = uSrc1.au64[0] << uShift;
11099 puDst->au64[1] = uSrc1.au64[1] << uShift;
11100 }
11101 else
11102 {
11103 puDst->au64[0] = 0;
11104 puDst->au64[1] = 0;
11105 }
11106}
11107
11108#endif
11109
11110IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
11111{
11112 RTUINT128U uSrc1 = *puSrc1;
11113
11114 if (uShift <= 63)
11115 {
11116 puDst->au64[0] = uSrc1.au64[0] << uShift;
11117 puDst->au64[1] = uSrc1.au64[1] << uShift;
11118 }
11119 else
11120 {
11121 puDst->au64[0] = 0;
11122 puDst->au64[1] = 0;
11123 }
11124}
11125
11126IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11127{
11128 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11129}
11130
11131IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
11132{
11133 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, uShift);
11134}
11135
11136IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11137{
11138 RTUINT256U uSrc1 = *puSrc1;
11139
11140 if (uShift <= 63)
11141 {
11142 puDst->au64[0] = uSrc1.au64[0] << uShift;
11143 puDst->au64[1] = uSrc1.au64[1] << uShift;
11144 puDst->au64[2] = uSrc1.au64[2] << uShift;
11145 puDst->au64[3] = uSrc1.au64[3] << uShift;
11146 }
11147 else
11148 {
11149 puDst->au64[0] = 0;
11150 puDst->au64[1] = 0;
11151 puDst->au64[2] = 0;
11152 puDst->au64[3] = 0;
11153 }
11154}
11155
11156IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11157{
11158 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11159}
11160
11161IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11162{
11163 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, uShift);
11164}
11165
11166
11167/*
11168 * PSRLDQ / VPSRLDQ
11169 */
11170#ifdef IEM_WITHOUT_ASSEMBLY
11171
11172IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11173{
11174 RTUINT128U uSrc1 = *puDst;
11175
11176 if (uShift < 16)
11177 {
11178 int i;
11179
11180 for (i = 0; i < 16 - uShift; ++i)
11181 puDst->au8[i] = uSrc1.au8[i + uShift];
11182 for (i = 16 - uShift; i < 16; ++i)
11183 puDst->au8[i] = 0;
11184 }
11185 else
11186 {
11187 puDst->au64[0] = 0;
11188 puDst->au64[1] = 0;
11189 }
11190}
11191
11192#endif
11193
11194
11195/*
11196 * PSLLDQ / VPSLLDQ
11197 */
11198#ifdef IEM_WITHOUT_ASSEMBLY
11199
11200IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11201{
11202 RTUINT128U uSrc1 = *puDst;
11203
11204 if (uShift < 16)
11205 {
11206 int i;
11207
11208 for (i = 0; i < uShift; ++i)
11209 puDst->au8[i] = 0;
11210 for (i = uShift; i < 16; ++i)
11211 puDst->au8[i] = uSrc1.au8[i - uShift];
11212 }
11213 else
11214 {
11215 puDst->au64[0] = 0;
11216 puDst->au64[1] = 0;
11217 }
11218}
11219
11220#endif
11221
11222
11223/*
11224 * VPSRLVD
11225 */
11226IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11227{
11228 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11229 {
11230 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] >> puSrc2->au8[uU32 << 2];
11231 }
11232}
11233
11234IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11235{
11236 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11237 {
11238 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] >> puSrc2->au8[uU32 << 2];
11239 }
11240}
11241
11242
11243/*
11244 * VPSRAVD
11245 */
11246IEM_DECL_IMPL_DEF(void, iemAImpl_vpsravd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11247{
11248 for (uint8_t uI32 = 0; uI32 < RT_ELEMENTS(puDst->ai32); ++uI32)
11249 {
11250 puDst->ai32[uI32] = (puSrc2->au32[uI32] > 31) ? 0 : puSrc1->ai32[uI32] >> puSrc2->au8[uI32 << 2];
11251 }
11252}
11253
11254IEM_DECL_IMPL_DEF(void, iemAImpl_vpsravd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11255{
11256 for (uint8_t uI32 = 0; uI32 < RT_ELEMENTS(puDst->ai32); ++uI32)
11257 {
11258 puDst->ai32[uI32] = (puSrc2->au32[uI32] > 31) ? 0 : puSrc1->ai32[uI32] >> puSrc2->au8[uI32 << 2];
11259 }
11260}
11261
11262
11263/*
11264 * VPSLLVD
11265 */
11266IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11267{
11268 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11269 {
11270 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] << puSrc2->au8[uU32 << 2];
11271 }
11272}
11273
11274IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11275{
11276 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11277 {
11278 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] << puSrc2->au8[uU32 << 2];
11279 }
11280}
11281
11282
11283/*
11284 * VPSRLVQ
11285 */
11286IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11287{
11288 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11289 {
11290 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] >> puSrc2->au8[uU64 << 3];
11291 }
11292}
11293
11294IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11295{
11296 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11297 {
11298 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] >> puSrc2->au8[uU64 << 3];
11299 }
11300}
11301
11302
11303/*
11304 * VPSLLVQ
11305 */
11306IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11307{
11308 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11309 {
11310 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] << puSrc2->au8[uU64 << 3];
11311 }
11312}
11313
11314IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11315{
11316 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11317 {
11318 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] << puSrc2->au8[uU64 << 3];
11319 }
11320}
11321
11322
11323/*
11324 * PMADDWD / VPMADDWD
11325 */
11326#ifdef IEM_WITHOUT_ASSEMBLY
11327
11328IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11329{
11330 RTUINT64U uSrc1 = { *puDst };
11331 RTUINT64U uSrc2 = { *puSrc };
11332 RTUINT64U uDst;
11333
11334 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11335 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11336 *puDst = uDst.u;
11337 RT_NOREF(pFpuState);
11338}
11339
11340
11341IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11342{
11343 RTUINT128U uSrc1 = *puDst;
11344
11345 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11346 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11347 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11348 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11349 RT_NOREF(pFpuState);
11350}
11351
11352#endif
11353
11354
11355IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11356{
11357 RTUINT64U uSrc1 = { *puDst };
11358 RTUINT64U uSrc2 = { *puSrc };
11359 RTUINT64U uDst;
11360
11361 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11362 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11363 *puDst = uDst.u;
11364 RT_NOREF(pFpuState);
11365}
11366
11367
11368IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11369{
11370 RTUINT128U uSrc1 = *puDst;
11371
11372 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11373 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11374 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11375 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11376 RT_NOREF(pFpuState);
11377}
11378
11379
11380IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11381{
11382 puDst->ai32[0] = (int32_t)puSrc1->ai16[0] * puSrc2->ai16[0] + (int32_t)puSrc1->ai16[1] * puSrc2->ai16[1];
11383 puDst->ai32[1] = (int32_t)puSrc1->ai16[2] * puSrc2->ai16[2] + (int32_t)puSrc1->ai16[3] * puSrc2->ai16[3];
11384 puDst->ai32[2] = (int32_t)puSrc1->ai16[4] * puSrc2->ai16[4] + (int32_t)puSrc1->ai16[5] * puSrc2->ai16[5];
11385 puDst->ai32[3] = (int32_t)puSrc1->ai16[6] * puSrc2->ai16[6] + (int32_t)puSrc1->ai16[7] * puSrc2->ai16[7];
11386}
11387
11388
11389IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11390{
11391 puDst->ai32[0] = (int32_t)puSrc1->ai16[0] * puSrc2->ai16[0] + (int32_t)puSrc1->ai16[1] * puSrc2->ai16[1];
11392 puDst->ai32[1] = (int32_t)puSrc1->ai16[2] * puSrc2->ai16[2] + (int32_t)puSrc1->ai16[3] * puSrc2->ai16[3];
11393 puDst->ai32[2] = (int32_t)puSrc1->ai16[4] * puSrc2->ai16[4] + (int32_t)puSrc1->ai16[5] * puSrc2->ai16[5];
11394 puDst->ai32[3] = (int32_t)puSrc1->ai16[6] * puSrc2->ai16[6] + (int32_t)puSrc1->ai16[7] * puSrc2->ai16[7];
11395 puDst->ai32[4] = (int32_t)puSrc1->ai16[8] * puSrc2->ai16[8] + (int32_t)puSrc1->ai16[9] * puSrc2->ai16[9];
11396 puDst->ai32[5] = (int32_t)puSrc1->ai16[10] * puSrc2->ai16[10] + (int32_t)puSrc1->ai16[11] * puSrc2->ai16[11];
11397 puDst->ai32[6] = (int32_t)puSrc1->ai16[12] * puSrc2->ai16[12] + (int32_t)puSrc1->ai16[13] * puSrc2->ai16[13];
11398 puDst->ai32[7] = (int32_t)puSrc1->ai16[14] * puSrc2->ai16[14] + (int32_t)puSrc1->ai16[15] * puSrc2->ai16[15];
11399}
11400
11401
11402/*
11403 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
11404 */
11405#ifdef IEM_WITHOUT_ASSEMBLY
11406
11407IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11408{
11409 RTUINT64U uSrc1 = { *puDst };
11410 RTUINT64U uSrc2 = { *puSrc };
11411 RTUINT64U uDst;
11412
11413 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
11414 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
11415 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
11416 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
11417 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
11418 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
11419 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
11420 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
11421 *puDst = uDst.u;
11422 RT_NOREF(pFpuState);
11423}
11424
11425
11426IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11427{
11428 RTUINT128U uSrc1 = *puDst;
11429
11430 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
11431 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
11432 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
11433 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
11434 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
11435 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
11436 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
11437 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
11438 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
11439 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
11440 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
11441 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
11442 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
11443 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
11444 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
11445 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
11446 RT_NOREF(pFpuState);
11447}
11448
11449#endif
11450
11451
11452IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11453{
11454 RTUINT128U uSrc1 = *puDst;
11455
11456 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
11457 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
11458 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
11459 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
11460 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
11461 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
11462 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
11463 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
11464 RT_NOREF(pFpuState);
11465}
11466
11467
11468IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11469{
11470 RTUINT128U uSrc1 = *puDst;
11471
11472 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
11473 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
11474 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
11475 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
11476 RT_NOREF(pFpuState);
11477}
11478
11479
11480IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11481 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11482{
11483 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11484 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11485 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11486 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11487 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11488 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11489 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11490 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11491 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11492 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11493 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11494 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11495 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11496 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11497 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11498 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11499 RT_NOREF(pExtState);
11500}
11501
11502
11503IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11504 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11505{
11506 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11507 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11508 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11509 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11510 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11511 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11512 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11513 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11514 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11515 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11516 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11517 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11518 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11519 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11520 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11521 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11522 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
11523 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
11524 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
11525 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
11526 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
11527 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
11528 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
11529 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
11530 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
11531 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
11532 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
11533 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
11534 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
11535 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
11536 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
11537 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
11538 RT_NOREF(pExtState);
11539}
11540
11541
11542IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11543 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11544{
11545 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11546 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11547 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11548 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11549 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11550 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11551 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11552 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11553 RT_NOREF(pExtState);
11554}
11555
11556
11557IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11558 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11559{
11560 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11561 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11562 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11563 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11564 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11565 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11566 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11567 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11568 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11569 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11570 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
11571 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
11572 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
11573 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
11574 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
11575 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
11576 RT_NOREF(pExtState);
11577}
11578
11579
11580IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11581 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11582{
11583 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11584 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11585 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11586 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11587 RT_NOREF(pExtState);
11588}
11589
11590
11591IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11592 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11593{
11594 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11595 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11596 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11597 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11598 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11599 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11600 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11601 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11602 RT_NOREF(pExtState);
11603}
11604
11605
11606/*
11607 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
11608 */
11609#ifdef IEM_WITHOUT_ASSEMBLY
11610
11611IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11612{
11613 RTUINT64U uSrc1 = { *puDst };
11614 RTUINT64U uSrc2 = { *puSrc };
11615 RTUINT64U uDst;
11616
11617 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
11618 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
11619 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
11620 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
11621 *puDst = uDst.u;
11622 RT_NOREF(pFpuState);
11623}
11624
11625
11626IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11627{
11628 RTUINT128U uSrc1 = *puDst;
11629
11630 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11631 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11632 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11633 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11634 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11635 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11636 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11637 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11638 RT_NOREF(pFpuState);
11639}
11640
11641#endif
11642
11643IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11644{
11645 RTUINT128U uSrc1 = *puDst;
11646
11647 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11648 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11649 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11650 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11651 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11652 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11653 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11654 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11655 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11656 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11657 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
11658 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
11659 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
11660 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
11661 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
11662 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
11663 RT_NOREF(pFpuState);
11664}
11665
11666
11667IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11668{
11669 RTUINT128U uSrc1 = *puDst;
11670
11671 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11672 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11673 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11674 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11675 RT_NOREF(pFpuState);
11676}
11677
11678
11679IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11680 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11681{
11682 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11683 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11684 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11685 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11686 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11687 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11688 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11689 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11690 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11691 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11692 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11693 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11694 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11695 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11696 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11697 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11698 RT_NOREF(pExtState);
11699}
11700
11701
11702IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11703 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11704{
11705 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11706 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11707 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11708 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11709 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11710 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11711 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11712 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11713 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11714 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11715 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11716 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11717 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11718 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11719 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11720 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11721 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
11722 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
11723 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
11724 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
11725 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
11726 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
11727 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
11728 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
11729 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
11730 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
11731 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
11732 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
11733 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
11734 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
11735 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
11736 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
11737 RT_NOREF(pExtState);
11738}
11739
11740
11741IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11742 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11743{
11744 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11745 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11746 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11747 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11748 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11749 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11750 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11751 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11752 RT_NOREF(pExtState);
11753}
11754
11755
11756IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11757 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11758{
11759 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11760 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11761 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11762 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11763 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11764 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11765 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11766 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11767 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11768 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11769 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
11770 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
11771 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
11772 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
11773 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
11774 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
11775 RT_NOREF(pExtState);
11776}
11777
11778
11779IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11780 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11781{
11782 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11783 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11784 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11785 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11786 RT_NOREF(pExtState);
11787}
11788
11789
11790IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11791 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11792{
11793 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11794 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11795 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11796 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11797 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11798 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11799 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11800 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11801 RT_NOREF(pExtState);
11802}
11803
11804
11805/*
11806 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
11807 */
11808#ifdef IEM_WITHOUT_ASSEMBLY
11809
11810IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11811{
11812 RTUINT64U uSrc1 = { *puDst };
11813 RTUINT64U uSrc2 = { *puSrc };
11814 RTUINT64U uDst;
11815
11816 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
11817 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
11818 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
11819 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
11820 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
11821 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
11822 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
11823 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
11824 *puDst = uDst.u;
11825 RT_NOREF(pFpuState);
11826}
11827
11828
11829IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11830{
11831 RTUINT128U uSrc1 = *puDst;
11832
11833 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
11834 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
11835 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
11836 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
11837 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
11838 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
11839 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
11840 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
11841 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
11842 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
11843 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
11844 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
11845 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
11846 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
11847 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
11848 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
11849 RT_NOREF(pFpuState);
11850}
11851
11852#endif
11853
11854IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11855{
11856 RTUINT128U uSrc1 = *puDst;
11857
11858 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
11859 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
11860 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
11861 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
11862 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
11863 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
11864 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
11865 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
11866 RT_NOREF(pFpuState);
11867}
11868
11869
11870IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11871{
11872 RTUINT128U uSrc1 = *puDst;
11873
11874 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
11875 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
11876 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
11877 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
11878 RT_NOREF(pFpuState);
11879}
11880
11881
11882IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11883 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11884{
11885 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11886 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11887 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11888 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11889 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11890 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11891 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11892 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11893 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11894 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11895 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11896 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11897 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11898 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11899 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11900 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11901 RT_NOREF(pExtState);
11902}
11903
11904
11905IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11906 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11907{
11908 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11909 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11910 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11911 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11912 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11913 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11914 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11915 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11916 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11917 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11918 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11919 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11920 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11921 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11922 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11923 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11924 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
11925 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
11926 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
11927 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
11928 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
11929 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
11930 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
11931 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
11932 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
11933 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
11934 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
11935 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
11936 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
11937 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
11938 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
11939 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
11940 RT_NOREF(pExtState);
11941}
11942
11943
11944IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11945 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11946{
11947 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11948 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11949 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11950 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11951 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11952 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11953 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11954 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11955 RT_NOREF(pExtState);
11956}
11957
11958
11959IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11960 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11961{
11962 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11963 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11964 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11965 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11966 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11967 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11968 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11969 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11970 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11971 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11972 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
11973 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
11974 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
11975 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
11976 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
11977 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
11978 RT_NOREF(pExtState);
11979}
11980
11981
11982IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11983 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11984{
11985 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11986 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11987 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11988 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11989 RT_NOREF(pExtState);
11990}
11991
11992
11993IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11994 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11995{
11996 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11997 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11998 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11999 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
12000 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
12001 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
12002 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
12003 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
12004 RT_NOREF(pExtState);
12005}
12006
12007
12008/*
12009 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
12010 */
12011#ifdef IEM_WITHOUT_ASSEMBLY
12012
12013IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12014{
12015 RTUINT64U uSrc1 = { *puDst };
12016 RTUINT64U uSrc2 = { *puSrc };
12017 RTUINT64U uDst;
12018
12019 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
12020 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
12021 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
12022 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
12023 *puDst = uDst.u;
12024 RT_NOREF(pFpuState);
12025}
12026
12027
12028IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12029{
12030 RTUINT128U uSrc1 = *puDst;
12031
12032 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
12033 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
12034 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
12035 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
12036 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
12037 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
12038 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
12039 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
12040 RT_NOREF(pFpuState);
12041}
12042
12043#endif
12044
12045IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12046{
12047 RTUINT128U uSrc1 = *puDst;
12048
12049 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
12050 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
12051 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
12052 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
12053 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
12054 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
12055 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
12056 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
12057 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
12058 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
12059 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
12060 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
12061 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
12062 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
12063 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
12064 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
12065 RT_NOREF(pFpuState);
12066}
12067
12068
12069IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12070{
12071 RTUINT128U uSrc1 = *puDst;
12072
12073 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
12074 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
12075 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
12076 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
12077 RT_NOREF(pFpuState);
12078}
12079
12080
12081IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
12082 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12083{
12084 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
12085 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
12086 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
12087 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
12088 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
12089 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
12090 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
12091 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
12092 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
12093 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
12094 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
12095 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
12096 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
12097 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
12098 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
12099 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
12100 RT_NOREF(pExtState);
12101}
12102
12103
12104IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12105 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12106{
12107 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
12108 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
12109 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
12110 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
12111 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
12112 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
12113 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
12114 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
12115 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
12116 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
12117 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
12118 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
12119 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
12120 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
12121 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
12122 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
12123 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
12124 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
12125 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
12126 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
12127 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
12128 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
12129 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
12130 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
12131 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
12132 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
12133 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
12134 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
12135 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
12136 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
12137 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
12138 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
12139 RT_NOREF(pExtState);
12140}
12141
12142
12143IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
12144 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12145{
12146 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12147 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12148 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12149 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12150 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12151 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12152 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12153 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12154 RT_NOREF(pExtState);
12155}
12156
12157
12158IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12159 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12160{
12161 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12162 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12163 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12164 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12165 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12166 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12167 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12168 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12169 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
12170 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
12171 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
12172 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
12173 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
12174 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
12175 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
12176 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
12177 RT_NOREF(pExtState);
12178}
12179
12180
12181IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
12182 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12183{
12184 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12185 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12186 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12187 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12188 RT_NOREF(pExtState);
12189}
12190
12191
12192IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12193 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12194{
12195 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12196 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12197 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12198 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12199 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
12200 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
12201 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
12202 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
12203 RT_NOREF(pExtState);
12204}
12205
12206
12207/*
12208 * PAVGB / VPAVGB / PAVGW / VPAVGW
12209 */
/** Byte average with rounding: (a_Src1 + a_Src2 + 1) >> 1, truncated to 8 bits.
 * The first operand is widened to 16 bits before the addition, so the sum of
 * two 8-bit values plus one is formed without wrapping. */
#define PAVGB_EXEC(a_Src1, a_Src2) \
    ( (uint8_t)( ((uint16_t)(a_Src1) + (a_Src2) + 1) >> 1 ) )

/** Word average with rounding: (a_Src1 + a_Src2 + 1) >> 1, truncated to 16 bits.
 * The first operand is widened to 32 bits before the addition, so the sum of
 * two 16-bit values plus one is formed without wrapping. */
#define PAVGW_EXEC(a_Src1, a_Src2) \
    ( (uint16_t)( ((uint32_t)(a_Src1) + (a_Src2) + 1) >> 1 ) )
12212
12213#ifdef IEM_WITHOUT_ASSEMBLY
12214
12215IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12216{
12217 RTUINT64U uSrc1 = { *puDst };
12218 RTUINT64U uSrc2 = { *puSrc };
12219 RTUINT64U uDst;
12220
12221 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
12222 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
12223 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
12224 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
12225 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
12226 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
12227 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
12228 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
12229 *puDst = uDst.u;
12230}
12231
12232
12233IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12234{
12235 RTUINT128U uSrc1 = *puDst;
12236
12237 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12238 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12239 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12240 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12241 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12242 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12243 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12244 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12245 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12246 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12247 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12248 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12249 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12250 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12251 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12252 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12253}
12254
12255
12256IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12257{
12258 RTUINT64U uSrc1 = { *puDst };
12259 RTUINT64U uSrc2 = { *puSrc };
12260 RTUINT64U uDst;
12261
12262 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
12263 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
12264 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
12265 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
12266 *puDst = uDst.u;
12267}
12268
12269
12270IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12271{
12272 RTUINT128U uSrc1 = *puDst;
12273
12274 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
12275 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
12276 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
12277 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
12278 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
12279 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
12280 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
12281 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
12282}
12283
12284#endif
12285
12286IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12287{
12288 RTUINT128U uSrc1 = *puDst;
12289
12290 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12291 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12292 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12293 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12294 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12295 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12296 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12297 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12298 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12299 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12300 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12301 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12302 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12303 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12304 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12305 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12306}
12307
12308
12309IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12310{
12311 RTUINT128U uSrc1 = *puDst;
12312
12313 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12314 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12315 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12316 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12317 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12318 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12319 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12320 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12321 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12322 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12323 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12324 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12325 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12326 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12327 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12328 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12329}
12330
12331
12332IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12333{
12334 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12335 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12336 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12337 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12338 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12339 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12340 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12341 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12342 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12343 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12344 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12345 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12346 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12347 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12348 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12349 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12350}
12351
12352
12353IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12354{
12355 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12356 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12357 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12358 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12359 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12360 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12361 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12362 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12363 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12364 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12365 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12366 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12367 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12368 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12369 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12370 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12371 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
12372 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
12373 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
12374 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
12375 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
12376 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
12377 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
12378 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
12379 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
12380 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
12381 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
12382 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
12383 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
12384 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
12385 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
12386 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
12387}
12388
12389
12390IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12391{
12392 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12393 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12394 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12395 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12396 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12397 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12398 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12399 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12400}
12401
12402
12403IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12404{
12405 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12406 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12407 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12408 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12409 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12410 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12411 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12412 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12413 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
12414 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
12415 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
12416 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
12417 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
12418 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
12419 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
12420 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
12421}
12422
12423#undef PAVGB_EXEC
12424#undef PAVGW_EXEC
12425
12426
12427/*
12428 * PMOVMSKB / VPMOVMSKB
12429 */
12430#ifdef IEM_WITHOUT_ASSEMBLY
12431
12432IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
12433{
12434 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12435 uint64_t const uSrc = *pu64Src;
12436 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
12437 | ((uSrc >> (15-1)) & RT_BIT_64(1))
12438 | ((uSrc >> (23-2)) & RT_BIT_64(2))
12439 | ((uSrc >> (31-3)) & RT_BIT_64(3))
12440 | ((uSrc >> (39-4)) & RT_BIT_64(4))
12441 | ((uSrc >> (47-5)) & RT_BIT_64(5))
12442 | ((uSrc >> (55-6)) & RT_BIT_64(6))
12443 | ((uSrc >> (63-7)) & RT_BIT_64(7));
12444}
12445
12446
12447IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
12448{
12449 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12450 uint64_t const uSrc0 = pu128Src->QWords.qw0;
12451 uint64_t const uSrc1 = pu128Src->QWords.qw1;
12452 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12453 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12454 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12455 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12456 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12457 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12458 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12459 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12460 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12461 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12462 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12463 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12464 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12465 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12466 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12467 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
12468}
12469
12470#endif
12471
12472IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
12473{
12474 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12475 uint64_t const uSrc0 = puSrc->QWords.qw0;
12476 uint64_t const uSrc1 = puSrc->QWords.qw1;
12477 uint64_t const uSrc2 = puSrc->QWords.qw2;
12478 uint64_t const uSrc3 = puSrc->QWords.qw3;
12479 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12480 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12481 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12482 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12483 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12484 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12485 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12486 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12487 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12488 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12489 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12490 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12491 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12492 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12493 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12494 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
12495 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
12496 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
12497 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
12498 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
12499 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
12500 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
12501 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
12502 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
12503 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
12504 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
12505 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
12506 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
12507 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
12508 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
12509 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
12510 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
12511}
12512
12513
12514/*
12515 * [V]PSHUFB
12516 */
12517
12518IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12519{
12520 RTUINT64U const uSrc = { *puSrc };
12521 RTUINT64U const uDstIn = { *puDst };
12522 ASMCompilerBarrier();
12523 RTUINT64U uDstOut = { 0 };
12524 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
12525 {
12526 uint8_t idxSrc = uSrc.au8[iByte];
12527 if (!(idxSrc & 0x80))
12528 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
12529 }
12530 *puDst = uDstOut.u;
12531 RT_NOREF(pFpuState);
12532}
12533
12534
12535IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12536{
12537 RTUINT128U const uSrc = *puSrc;
12538 RTUINT128U const uDstIn = *puDst;
12539 ASMCompilerBarrier();
12540 puDst->au64[0] = 0;
12541 puDst->au64[1] = 0;
12542 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12543 {
12544 uint8_t idxSrc = uSrc.au8[iByte];
12545 if (!(idxSrc & 0x80))
12546 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
12547 }
12548 RT_NOREF(pFpuState);
12549}
12550
12551
12552IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
12553 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12554{
12555 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
12556 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
12557 ASMCompilerBarrier();
12558 puDst->au64[0] = 0;
12559 puDst->au64[1] = 0;
12560 for (unsigned iByte = 0; iByte < 16; iByte++)
12561 {
12562 uint8_t idxSrc = uSrc2.au8[iByte];
12563 if (!(idxSrc & 0x80))
12564 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12565 }
12566 RT_NOREF(pExtState);
12567}
12568
12569
12570IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12571 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12572{
12573 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
12574 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
12575 ASMCompilerBarrier();
12576 puDst->au64[0] = 0;
12577 puDst->au64[1] = 0;
12578 puDst->au64[2] = 0;
12579 puDst->au64[3] = 0;
12580 for (unsigned iByte = 0; iByte < 16; iByte++)
12581 {
12582 uint8_t idxSrc = uSrc2.au8[iByte];
12583 if (!(idxSrc & 0x80))
12584 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12585 }
12586 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12587 {
12588 uint8_t idxSrc = uSrc2.au8[iByte];
12589 if (!(idxSrc & 0x80))
12590 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
12591 }
12592 RT_NOREF(pExtState);
12593}
12594
12595
12596/*
12597 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
12598 */
12599#ifdef IEM_WITHOUT_ASSEMBLY
12600
12601IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
12602{
12603 uint64_t const uSrc = *puSrc;
12604 ASMCompilerBarrier();
12605 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12606 uSrc >> (((bEvil >> 2) & 3) * 16),
12607 uSrc >> (((bEvil >> 4) & 3) * 16),
12608 uSrc >> (((bEvil >> 6) & 3) * 16));
12609}
12610
12611
12612IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12613{
12614 puDst->QWords.qw0 = puSrc->QWords.qw0;
12615 uint64_t const uSrc = puSrc->QWords.qw1;
12616 ASMCompilerBarrier();
12617 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12618 uSrc >> (((bEvil >> 2) & 3) * 16),
12619 uSrc >> (((bEvil >> 4) & 3) * 16),
12620 uSrc >> (((bEvil >> 6) & 3) * 16));
12621}
12622
12623#endif
12624
12625IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12626{
12627 puDst->QWords.qw0 = puSrc->QWords.qw0;
12628 uint64_t const uSrc1 = puSrc->QWords.qw1;
12629 puDst->QWords.qw2 = puSrc->QWords.qw2;
12630 uint64_t const uSrc3 = puSrc->QWords.qw3;
12631 ASMCompilerBarrier();
12632 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
12633 uSrc1 >> (((bEvil >> 2) & 3) * 16),
12634 uSrc1 >> (((bEvil >> 4) & 3) * 16),
12635 uSrc1 >> (((bEvil >> 6) & 3) * 16));
12636 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
12637 uSrc3 >> (((bEvil >> 2) & 3) * 16),
12638 uSrc3 >> (((bEvil >> 4) & 3) * 16),
12639 uSrc3 >> (((bEvil >> 6) & 3) * 16));
12640}
12641
12642#ifdef IEM_WITHOUT_ASSEMBLY
12643IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12644{
12645 puDst->QWords.qw1 = puSrc->QWords.qw1;
12646 uint64_t const uSrc = puSrc->QWords.qw0;
12647 ASMCompilerBarrier();
12648 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12649 uSrc >> (((bEvil >> 2) & 3) * 16),
12650 uSrc >> (((bEvil >> 4) & 3) * 16),
12651 uSrc >> (((bEvil >> 6) & 3) * 16));
12652
12653}
12654#endif
12655
12656
12657IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12658{
12659 puDst->QWords.qw3 = puSrc->QWords.qw3;
12660 uint64_t const uSrc2 = puSrc->QWords.qw2;
12661 puDst->QWords.qw1 = puSrc->QWords.qw1;
12662 uint64_t const uSrc0 = puSrc->QWords.qw0;
12663 ASMCompilerBarrier();
12664 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
12665 uSrc0 >> (((bEvil >> 2) & 3) * 16),
12666 uSrc0 >> (((bEvil >> 4) & 3) * 16),
12667 uSrc0 >> (((bEvil >> 6) & 3) * 16));
12668 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
12669 uSrc2 >> (((bEvil >> 2) & 3) * 16),
12670 uSrc2 >> (((bEvil >> 4) & 3) * 16),
12671 uSrc2 >> (((bEvil >> 6) & 3) * 16));
12672
12673}
12674
12675
12676#ifdef IEM_WITHOUT_ASSEMBLY
12677IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12678{
12679 RTUINT128U const uSrc = *puSrc;
12680 ASMCompilerBarrier();
12681 puDst->au32[0] = uSrc.au32[bEvil & 3];
12682 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
12683 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
12684 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
12685}
12686#endif
12687
12688
12689IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12690{
12691 RTUINT256U const uSrc = *puSrc;
12692 ASMCompilerBarrier();
12693 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
12694 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
12695 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
12696 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
12697 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
12698 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
12699 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
12700 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
12701}
12702
12703
12704/*
12705 * PUNPCKHBW - high bytes -> words
12706 */
12707#ifdef IEM_WITHOUT_ASSEMBLY
12708
12709IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12710{
12711 RTUINT64U const uSrc2 = { *puSrc };
12712 RTUINT64U const uSrc1 = { *puDst };
12713 ASMCompilerBarrier();
12714 RTUINT64U uDstOut;
12715 uDstOut.au8[0] = uSrc1.au8[4];
12716 uDstOut.au8[1] = uSrc2.au8[4];
12717 uDstOut.au8[2] = uSrc1.au8[5];
12718 uDstOut.au8[3] = uSrc2.au8[5];
12719 uDstOut.au8[4] = uSrc1.au8[6];
12720 uDstOut.au8[5] = uSrc2.au8[6];
12721 uDstOut.au8[6] = uSrc1.au8[7];
12722 uDstOut.au8[7] = uSrc2.au8[7];
12723 *puDst = uDstOut.u;
12724}
12725
12726
12727IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12728{
12729 RTUINT128U const uSrc2 = *puSrc;
12730 RTUINT128U const uSrc1 = *puDst;
12731 ASMCompilerBarrier();
12732 RTUINT128U uDstOut;
12733 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12734 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12735 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12736 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12737 uDstOut.au8[ 4] = uSrc1.au8[10];
12738 uDstOut.au8[ 5] = uSrc2.au8[10];
12739 uDstOut.au8[ 6] = uSrc1.au8[11];
12740 uDstOut.au8[ 7] = uSrc2.au8[11];
12741 uDstOut.au8[ 8] = uSrc1.au8[12];
12742 uDstOut.au8[ 9] = uSrc2.au8[12];
12743 uDstOut.au8[10] = uSrc1.au8[13];
12744 uDstOut.au8[11] = uSrc2.au8[13];
12745 uDstOut.au8[12] = uSrc1.au8[14];
12746 uDstOut.au8[13] = uSrc2.au8[14];
12747 uDstOut.au8[14] = uSrc1.au8[15];
12748 uDstOut.au8[15] = uSrc2.au8[15];
12749 *puDst = uDstOut;
12750}
12751
12752#endif
12753
12754IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12755{
12756 RTUINT128U const uSrc2 = *puSrc2;
12757 RTUINT128U const uSrc1 = *puSrc1;
12758 ASMCompilerBarrier();
12759 RTUINT128U uDstOut;
12760 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12761 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12762 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12763 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12764 uDstOut.au8[ 4] = uSrc1.au8[10];
12765 uDstOut.au8[ 5] = uSrc2.au8[10];
12766 uDstOut.au8[ 6] = uSrc1.au8[11];
12767 uDstOut.au8[ 7] = uSrc2.au8[11];
12768 uDstOut.au8[ 8] = uSrc1.au8[12];
12769 uDstOut.au8[ 9] = uSrc2.au8[12];
12770 uDstOut.au8[10] = uSrc1.au8[13];
12771 uDstOut.au8[11] = uSrc2.au8[13];
12772 uDstOut.au8[12] = uSrc1.au8[14];
12773 uDstOut.au8[13] = uSrc2.au8[14];
12774 uDstOut.au8[14] = uSrc1.au8[15];
12775 uDstOut.au8[15] = uSrc2.au8[15];
12776 *puDst = uDstOut;
12777}
12778
12779
12780IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12781{
12782 RTUINT256U const uSrc2 = *puSrc2;
12783 RTUINT256U const uSrc1 = *puSrc1;
12784 ASMCompilerBarrier();
12785 RTUINT256U uDstOut;
12786 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12787 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12788 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12789 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12790 uDstOut.au8[ 4] = uSrc1.au8[10];
12791 uDstOut.au8[ 5] = uSrc2.au8[10];
12792 uDstOut.au8[ 6] = uSrc1.au8[11];
12793 uDstOut.au8[ 7] = uSrc2.au8[11];
12794 uDstOut.au8[ 8] = uSrc1.au8[12];
12795 uDstOut.au8[ 9] = uSrc2.au8[12];
12796 uDstOut.au8[10] = uSrc1.au8[13];
12797 uDstOut.au8[11] = uSrc2.au8[13];
12798 uDstOut.au8[12] = uSrc1.au8[14];
12799 uDstOut.au8[13] = uSrc2.au8[14];
12800 uDstOut.au8[14] = uSrc1.au8[15];
12801 uDstOut.au8[15] = uSrc2.au8[15];
12802 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12803 uDstOut.au8[16] = uSrc1.au8[24];
12804 uDstOut.au8[17] = uSrc2.au8[24];
12805 uDstOut.au8[18] = uSrc1.au8[25];
12806 uDstOut.au8[19] = uSrc2.au8[25];
12807 uDstOut.au8[20] = uSrc1.au8[26];
12808 uDstOut.au8[21] = uSrc2.au8[26];
12809 uDstOut.au8[22] = uSrc1.au8[27];
12810 uDstOut.au8[23] = uSrc2.au8[27];
12811 uDstOut.au8[24] = uSrc1.au8[28];
12812 uDstOut.au8[25] = uSrc2.au8[28];
12813 uDstOut.au8[26] = uSrc1.au8[29];
12814 uDstOut.au8[27] = uSrc2.au8[29];
12815 uDstOut.au8[28] = uSrc1.au8[30];
12816 uDstOut.au8[29] = uSrc2.au8[30];
12817 uDstOut.au8[30] = uSrc1.au8[31];
12818 uDstOut.au8[31] = uSrc2.au8[31];
12819 *puDst = uDstOut;
12820}
12821
12822
/*
 * PUNPCKHWD - high words -> dwords
 */
12826#ifdef IEM_WITHOUT_ASSEMBLY
12827
12828IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12829{
12830 RTUINT64U const uSrc2 = { *puSrc };
12831 RTUINT64U const uSrc1 = { *puDst };
12832 ASMCompilerBarrier();
12833 RTUINT64U uDstOut;
12834 uDstOut.au16[0] = uSrc1.au16[2];
12835 uDstOut.au16[1] = uSrc2.au16[2];
12836 uDstOut.au16[2] = uSrc1.au16[3];
12837 uDstOut.au16[3] = uSrc2.au16[3];
12838 *puDst = uDstOut.u;
12839}
12840
12841
12842IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12843{
12844 RTUINT128U const uSrc2 = *puSrc;
12845 RTUINT128U const uSrc1 = *puDst;
12846 ASMCompilerBarrier();
12847 RTUINT128U uDstOut;
12848 uDstOut.au16[0] = uSrc1.au16[4];
12849 uDstOut.au16[1] = uSrc2.au16[4];
12850 uDstOut.au16[2] = uSrc1.au16[5];
12851 uDstOut.au16[3] = uSrc2.au16[5];
12852 uDstOut.au16[4] = uSrc1.au16[6];
12853 uDstOut.au16[5] = uSrc2.au16[6];
12854 uDstOut.au16[6] = uSrc1.au16[7];
12855 uDstOut.au16[7] = uSrc2.au16[7];
12856 *puDst = uDstOut;
12857}
12858
12859#endif
12860
12861IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12862{
12863 RTUINT128U const uSrc2 = *puSrc2;
12864 RTUINT128U const uSrc1 = *puSrc1;
12865 ASMCompilerBarrier();
12866 RTUINT128U uDstOut;
12867 uDstOut.au16[0] = uSrc1.au16[4];
12868 uDstOut.au16[1] = uSrc2.au16[4];
12869 uDstOut.au16[2] = uSrc1.au16[5];
12870 uDstOut.au16[3] = uSrc2.au16[5];
12871 uDstOut.au16[4] = uSrc1.au16[6];
12872 uDstOut.au16[5] = uSrc2.au16[6];
12873 uDstOut.au16[6] = uSrc1.au16[7];
12874 uDstOut.au16[7] = uSrc2.au16[7];
12875 *puDst = uDstOut;
12876}
12877
12878
12879IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12880{
12881 RTUINT256U const uSrc2 = *puSrc2;
12882 RTUINT256U const uSrc1 = *puSrc1;
12883 ASMCompilerBarrier();
12884 RTUINT256U uDstOut;
12885 uDstOut.au16[0] = uSrc1.au16[4];
12886 uDstOut.au16[1] = uSrc2.au16[4];
12887 uDstOut.au16[2] = uSrc1.au16[5];
12888 uDstOut.au16[3] = uSrc2.au16[5];
12889 uDstOut.au16[4] = uSrc1.au16[6];
12890 uDstOut.au16[5] = uSrc2.au16[6];
12891 uDstOut.au16[6] = uSrc1.au16[7];
12892 uDstOut.au16[7] = uSrc2.au16[7];
12893
12894 uDstOut.au16[8] = uSrc1.au16[12];
12895 uDstOut.au16[9] = uSrc2.au16[12];
12896 uDstOut.au16[10] = uSrc1.au16[13];
12897 uDstOut.au16[11] = uSrc2.au16[13];
12898 uDstOut.au16[12] = uSrc1.au16[14];
12899 uDstOut.au16[13] = uSrc2.au16[14];
12900 uDstOut.au16[14] = uSrc1.au16[15];
12901 uDstOut.au16[15] = uSrc2.au16[15];
12902 *puDst = uDstOut;
12903}
12904
12905
/*
 * PUNPCKHDQ - high dwords -> qword(s)
 */
12909#ifdef IEM_WITHOUT_ASSEMBLY
12910
12911IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12912{
12913 RTUINT64U const uSrc2 = { *puSrc };
12914 RTUINT64U const uSrc1 = { *puDst };
12915 ASMCompilerBarrier();
12916 RTUINT64U uDstOut;
12917 uDstOut.au32[0] = uSrc1.au32[1];
12918 uDstOut.au32[1] = uSrc2.au32[1];
12919 *puDst = uDstOut.u;
12920}
12921
12922
12923IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12924{
12925 RTUINT128U const uSrc2 = *puSrc;
12926 RTUINT128U const uSrc1 = *puDst;
12927 ASMCompilerBarrier();
12928 RTUINT128U uDstOut;
12929 uDstOut.au32[0] = uSrc1.au32[2];
12930 uDstOut.au32[1] = uSrc2.au32[2];
12931 uDstOut.au32[2] = uSrc1.au32[3];
12932 uDstOut.au32[3] = uSrc2.au32[3];
12933 *puDst = uDstOut;
12934}
12935
12936#endif
12937
12938IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12939{
12940 RTUINT128U const uSrc2 = *puSrc2;
12941 RTUINT128U const uSrc1 = *puSrc1;
12942 ASMCompilerBarrier();
12943 RTUINT128U uDstOut;
12944 uDstOut.au32[0] = uSrc1.au32[2];
12945 uDstOut.au32[1] = uSrc2.au32[2];
12946 uDstOut.au32[2] = uSrc1.au32[3];
12947 uDstOut.au32[3] = uSrc2.au32[3];
12948 *puDst = uDstOut;
12949}
12950
12951
12952IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12953{
12954 RTUINT256U const uSrc2 = *puSrc2;
12955 RTUINT256U const uSrc1 = *puSrc1;
12956 ASMCompilerBarrier();
12957 RTUINT256U uDstOut;
12958 uDstOut.au32[0] = uSrc1.au32[2];
12959 uDstOut.au32[1] = uSrc2.au32[2];
12960 uDstOut.au32[2] = uSrc1.au32[3];
12961 uDstOut.au32[3] = uSrc2.au32[3];
12962
12963 uDstOut.au32[4] = uSrc1.au32[6];
12964 uDstOut.au32[5] = uSrc2.au32[6];
12965 uDstOut.au32[6] = uSrc1.au32[7];
12966 uDstOut.au32[7] = uSrc2.au32[7];
12967 *puDst = uDstOut;
12968}
12969
12970
12971/*
12972 * PUNPCKHQDQ -> High qwords -> double qword(s).
12973 */
12974#ifdef IEM_WITHOUT_ASSEMBLY
12975IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12976{
12977 RTUINT128U const uSrc2 = *puSrc;
12978 RTUINT128U const uSrc1 = *puDst;
12979 ASMCompilerBarrier();
12980 RTUINT128U uDstOut;
12981 uDstOut.au64[0] = uSrc1.au64[1];
12982 uDstOut.au64[1] = uSrc2.au64[1];
12983 *puDst = uDstOut;
12984}
12985#endif
12986
12987
12988IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12989{
12990 RTUINT128U const uSrc2 = *puSrc2;
12991 RTUINT128U const uSrc1 = *puSrc1;
12992 ASMCompilerBarrier();
12993 RTUINT128U uDstOut;
12994 uDstOut.au64[0] = uSrc1.au64[1];
12995 uDstOut.au64[1] = uSrc2.au64[1];
12996 *puDst = uDstOut;
12997}
12998
12999
13000IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13001{
13002 RTUINT256U const uSrc2 = *puSrc2;
13003 RTUINT256U const uSrc1 = *puSrc1;
13004 ASMCompilerBarrier();
13005 RTUINT256U uDstOut;
13006 uDstOut.au64[0] = uSrc1.au64[1];
13007 uDstOut.au64[1] = uSrc2.au64[1];
13008
13009 uDstOut.au64[2] = uSrc1.au64[3];
13010 uDstOut.au64[3] = uSrc2.au64[3];
13011 *puDst = uDstOut;
13012}
13013
13014
13015/*
13016 * PUNPCKLBW - low bytes -> words
13017 */
13018#ifdef IEM_WITHOUT_ASSEMBLY
13019
13020IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13021{
13022 RTUINT64U const uSrc2 = { *puSrc };
13023 RTUINT64U const uSrc1 = { *puDst };
13024 ASMCompilerBarrier();
13025 RTUINT64U uDstOut;
13026 uDstOut.au8[0] = uSrc1.au8[0];
13027 uDstOut.au8[1] = uSrc2.au8[0];
13028 uDstOut.au8[2] = uSrc1.au8[1];
13029 uDstOut.au8[3] = uSrc2.au8[1];
13030 uDstOut.au8[4] = uSrc1.au8[2];
13031 uDstOut.au8[5] = uSrc2.au8[2];
13032 uDstOut.au8[6] = uSrc1.au8[3];
13033 uDstOut.au8[7] = uSrc2.au8[3];
13034 *puDst = uDstOut.u;
13035}
13036
13037
13038IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13039{
13040 RTUINT128U const uSrc2 = *puSrc;
13041 RTUINT128U const uSrc1 = *puDst;
13042 ASMCompilerBarrier();
13043 RTUINT128U uDstOut;
13044 uDstOut.au8[ 0] = uSrc1.au8[0];
13045 uDstOut.au8[ 1] = uSrc2.au8[0];
13046 uDstOut.au8[ 2] = uSrc1.au8[1];
13047 uDstOut.au8[ 3] = uSrc2.au8[1];
13048 uDstOut.au8[ 4] = uSrc1.au8[2];
13049 uDstOut.au8[ 5] = uSrc2.au8[2];
13050 uDstOut.au8[ 6] = uSrc1.au8[3];
13051 uDstOut.au8[ 7] = uSrc2.au8[3];
13052 uDstOut.au8[ 8] = uSrc1.au8[4];
13053 uDstOut.au8[ 9] = uSrc2.au8[4];
13054 uDstOut.au8[10] = uSrc1.au8[5];
13055 uDstOut.au8[11] = uSrc2.au8[5];
13056 uDstOut.au8[12] = uSrc1.au8[6];
13057 uDstOut.au8[13] = uSrc2.au8[6];
13058 uDstOut.au8[14] = uSrc1.au8[7];
13059 uDstOut.au8[15] = uSrc2.au8[7];
13060 *puDst = uDstOut;
13061}
13062
13063#endif
13064
13065IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13066{
13067 RTUINT128U const uSrc2 = *puSrc2;
13068 RTUINT128U const uSrc1 = *puSrc1;
13069 ASMCompilerBarrier();
13070 RTUINT128U uDstOut;
13071 uDstOut.au8[ 0] = uSrc1.au8[0];
13072 uDstOut.au8[ 1] = uSrc2.au8[0];
13073 uDstOut.au8[ 2] = uSrc1.au8[1];
13074 uDstOut.au8[ 3] = uSrc2.au8[1];
13075 uDstOut.au8[ 4] = uSrc1.au8[2];
13076 uDstOut.au8[ 5] = uSrc2.au8[2];
13077 uDstOut.au8[ 6] = uSrc1.au8[3];
13078 uDstOut.au8[ 7] = uSrc2.au8[3];
13079 uDstOut.au8[ 8] = uSrc1.au8[4];
13080 uDstOut.au8[ 9] = uSrc2.au8[4];
13081 uDstOut.au8[10] = uSrc1.au8[5];
13082 uDstOut.au8[11] = uSrc2.au8[5];
13083 uDstOut.au8[12] = uSrc1.au8[6];
13084 uDstOut.au8[13] = uSrc2.au8[6];
13085 uDstOut.au8[14] = uSrc1.au8[7];
13086 uDstOut.au8[15] = uSrc2.au8[7];
13087 *puDst = uDstOut;
13088}
13089
13090
13091IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13092{
13093 RTUINT256U const uSrc2 = *puSrc2;
13094 RTUINT256U const uSrc1 = *puSrc1;
13095 ASMCompilerBarrier();
13096 RTUINT256U uDstOut;
13097 uDstOut.au8[ 0] = uSrc1.au8[0];
13098 uDstOut.au8[ 1] = uSrc2.au8[0];
13099 uDstOut.au8[ 2] = uSrc1.au8[1];
13100 uDstOut.au8[ 3] = uSrc2.au8[1];
13101 uDstOut.au8[ 4] = uSrc1.au8[2];
13102 uDstOut.au8[ 5] = uSrc2.au8[2];
13103 uDstOut.au8[ 6] = uSrc1.au8[3];
13104 uDstOut.au8[ 7] = uSrc2.au8[3];
13105 uDstOut.au8[ 8] = uSrc1.au8[4];
13106 uDstOut.au8[ 9] = uSrc2.au8[4];
13107 uDstOut.au8[10] = uSrc1.au8[5];
13108 uDstOut.au8[11] = uSrc2.au8[5];
13109 uDstOut.au8[12] = uSrc1.au8[6];
13110 uDstOut.au8[13] = uSrc2.au8[6];
13111 uDstOut.au8[14] = uSrc1.au8[7];
13112 uDstOut.au8[15] = uSrc2.au8[7];
13113 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
13114 uDstOut.au8[16] = uSrc1.au8[16];
13115 uDstOut.au8[17] = uSrc2.au8[16];
13116 uDstOut.au8[18] = uSrc1.au8[17];
13117 uDstOut.au8[19] = uSrc2.au8[17];
13118 uDstOut.au8[20] = uSrc1.au8[18];
13119 uDstOut.au8[21] = uSrc2.au8[18];
13120 uDstOut.au8[22] = uSrc1.au8[19];
13121 uDstOut.au8[23] = uSrc2.au8[19];
13122 uDstOut.au8[24] = uSrc1.au8[20];
13123 uDstOut.au8[25] = uSrc2.au8[20];
13124 uDstOut.au8[26] = uSrc1.au8[21];
13125 uDstOut.au8[27] = uSrc2.au8[21];
13126 uDstOut.au8[28] = uSrc1.au8[22];
13127 uDstOut.au8[29] = uSrc2.au8[22];
13128 uDstOut.au8[30] = uSrc1.au8[23];
13129 uDstOut.au8[31] = uSrc2.au8[23];
13130 *puDst = uDstOut;
13131}
13132
13133
13134/*
13135 * PUNPCKLWD - low words -> dwords
13136 */
13137#ifdef IEM_WITHOUT_ASSEMBLY
13138
13139IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
13140{
13141 RTUINT64U const uSrc2 = { *puSrc };
13142 RTUINT64U const uSrc1 = { *puDst };
13143 ASMCompilerBarrier();
13144 RTUINT64U uDstOut;
13145 uDstOut.au16[0] = uSrc1.au16[0];
13146 uDstOut.au16[1] = uSrc2.au16[0];
13147 uDstOut.au16[2] = uSrc1.au16[1];
13148 uDstOut.au16[3] = uSrc2.au16[1];
13149 *puDst = uDstOut.u;
13150}
13151
13152
13153IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13154{
13155 RTUINT128U const uSrc2 = *puSrc;
13156 RTUINT128U const uSrc1 = *puDst;
13157 ASMCompilerBarrier();
13158 RTUINT128U uDstOut;
13159 uDstOut.au16[0] = uSrc1.au16[0];
13160 uDstOut.au16[1] = uSrc2.au16[0];
13161 uDstOut.au16[2] = uSrc1.au16[1];
13162 uDstOut.au16[3] = uSrc2.au16[1];
13163 uDstOut.au16[4] = uSrc1.au16[2];
13164 uDstOut.au16[5] = uSrc2.au16[2];
13165 uDstOut.au16[6] = uSrc1.au16[3];
13166 uDstOut.au16[7] = uSrc2.au16[3];
13167 *puDst = uDstOut;
13168}
13169
13170#endif
13171
13172IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13173{
13174 RTUINT128U const uSrc2 = *puSrc2;
13175 RTUINT128U const uSrc1 = *puSrc1;
13176 ASMCompilerBarrier();
13177 RTUINT128U uDstOut;
13178 uDstOut.au16[0] = uSrc1.au16[0];
13179 uDstOut.au16[1] = uSrc2.au16[0];
13180 uDstOut.au16[2] = uSrc1.au16[1];
13181 uDstOut.au16[3] = uSrc2.au16[1];
13182 uDstOut.au16[4] = uSrc1.au16[2];
13183 uDstOut.au16[5] = uSrc2.au16[2];
13184 uDstOut.au16[6] = uSrc1.au16[3];
13185 uDstOut.au16[7] = uSrc2.au16[3];
13186 *puDst = uDstOut;
13187}
13188
13189
13190IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13191{
13192 RTUINT256U const uSrc2 = *puSrc2;
13193 RTUINT256U const uSrc1 = *puSrc1;
13194 ASMCompilerBarrier();
13195 RTUINT256U uDstOut;
13196 uDstOut.au16[0] = uSrc1.au16[0];
13197 uDstOut.au16[1] = uSrc2.au16[0];
13198 uDstOut.au16[2] = uSrc1.au16[1];
13199 uDstOut.au16[3] = uSrc2.au16[1];
13200 uDstOut.au16[4] = uSrc1.au16[2];
13201 uDstOut.au16[5] = uSrc2.au16[2];
13202 uDstOut.au16[6] = uSrc1.au16[3];
13203 uDstOut.au16[7] = uSrc2.au16[3];
13204
13205 uDstOut.au16[8] = uSrc1.au16[8];
13206 uDstOut.au16[9] = uSrc2.au16[8];
13207 uDstOut.au16[10] = uSrc1.au16[9];
13208 uDstOut.au16[11] = uSrc2.au16[9];
13209 uDstOut.au16[12] = uSrc1.au16[10];
13210 uDstOut.au16[13] = uSrc2.au16[10];
13211 uDstOut.au16[14] = uSrc1.au16[11];
13212 uDstOut.au16[15] = uSrc2.au16[11];
13213 *puDst = uDstOut;
13214}
13215
13216
13217/*
13218 * PUNPCKLDQ - low dwords -> qword(s)
13219 */
13220#ifdef IEM_WITHOUT_ASSEMBLY
13221
13222IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
13223{
13224 RTUINT64U const uSrc2 = { *puSrc };
13225 RTUINT64U const uSrc1 = { *puDst };
13226 ASMCompilerBarrier();
13227 RTUINT64U uDstOut;
13228 uDstOut.au32[0] = uSrc1.au32[0];
13229 uDstOut.au32[1] = uSrc2.au32[0];
13230 *puDst = uDstOut.u;
13231}
13232
13233
13234IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13235{
13236 RTUINT128U const uSrc2 = *puSrc;
13237 RTUINT128U const uSrc1 = *puDst;
13238 ASMCompilerBarrier();
13239 RTUINT128U uDstOut;
13240 uDstOut.au32[0] = uSrc1.au32[0];
13241 uDstOut.au32[1] = uSrc2.au32[0];
13242 uDstOut.au32[2] = uSrc1.au32[1];
13243 uDstOut.au32[3] = uSrc2.au32[1];
13244 *puDst = uDstOut;
13245}
13246
13247#endif
13248
13249IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13250{
13251 RTUINT128U const uSrc2 = *puSrc2;
13252 RTUINT128U const uSrc1 = *puSrc1;
13253 ASMCompilerBarrier();
13254 RTUINT128U uDstOut;
13255 uDstOut.au32[0] = uSrc1.au32[0];
13256 uDstOut.au32[1] = uSrc2.au32[0];
13257 uDstOut.au32[2] = uSrc1.au32[1];
13258 uDstOut.au32[3] = uSrc2.au32[1];
13259 *puDst = uDstOut;
13260}
13261
13262
13263IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13264{
13265 RTUINT256U const uSrc2 = *puSrc2;
13266 RTUINT256U const uSrc1 = *puSrc1;
13267 ASMCompilerBarrier();
13268 RTUINT256U uDstOut;
13269 uDstOut.au32[0] = uSrc1.au32[0];
13270 uDstOut.au32[1] = uSrc2.au32[0];
13271 uDstOut.au32[2] = uSrc1.au32[1];
13272 uDstOut.au32[3] = uSrc2.au32[1];
13273
13274 uDstOut.au32[4] = uSrc1.au32[4];
13275 uDstOut.au32[5] = uSrc2.au32[4];
13276 uDstOut.au32[6] = uSrc1.au32[5];
13277 uDstOut.au32[7] = uSrc2.au32[5];
13278 *puDst = uDstOut;
13279}
13280
13281
13282/*
13283 * PUNPCKLQDQ -> Low qwords -> double qword(s).
13284 */
13285#ifdef IEM_WITHOUT_ASSEMBLY
13286IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13287{
13288 RTUINT128U const uSrc2 = *puSrc;
13289 RTUINT128U const uSrc1 = *puDst;
13290 ASMCompilerBarrier();
13291 RTUINT128U uDstOut;
13292 uDstOut.au64[0] = uSrc1.au64[0];
13293 uDstOut.au64[1] = uSrc2.au64[0];
13294 *puDst = uDstOut;
13295}
13296#endif
13297
13298
13299IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13300{
13301 RTUINT128U const uSrc2 = *puSrc2;
13302 RTUINT128U const uSrc1 = *puSrc1;
13303 ASMCompilerBarrier();
13304 RTUINT128U uDstOut;
13305 uDstOut.au64[0] = uSrc1.au64[0];
13306 uDstOut.au64[1] = uSrc2.au64[0];
13307 *puDst = uDstOut;
13308}
13309
13310
13311IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13312{
13313 RTUINT256U const uSrc2 = *puSrc2;
13314 RTUINT256U const uSrc1 = *puSrc1;
13315 ASMCompilerBarrier();
13316 RTUINT256U uDstOut;
13317 uDstOut.au64[0] = uSrc1.au64[0];
13318 uDstOut.au64[1] = uSrc2.au64[0];
13319
13320 uDstOut.au64[2] = uSrc1.au64[2];
13321 uDstOut.au64[3] = uSrc2.au64[2];
13322 *puDst = uDstOut;
13323}
13324
13325
13326/*
13327 * PACKSSWB - signed words -> signed bytes
13328 */
13329
13330#ifdef IEM_WITHOUT_ASSEMBLY
13331
13332IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13333{
13334 RTUINT64U const uSrc2 = { *puSrc };
13335 RTUINT64U const uSrc1 = { *puDst };
13336 ASMCompilerBarrier();
13337 RTUINT64U uDstOut;
13338 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13339 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13340 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13341 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13342 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13343 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13344 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13345 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13346 *puDst = uDstOut.u;
13347}
13348
13349
13350IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13351{
13352 RTUINT128U const uSrc2 = *puSrc;
13353 RTUINT128U const uSrc1 = *puDst;
13354 ASMCompilerBarrier();
13355 RTUINT128U uDstOut;
13356 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13357 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13358 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13359 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13360 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13361 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13362 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13363 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13364 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13365 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13366 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13367 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13368 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13369 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13370 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13371 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13372 *puDst = uDstOut;
13373}
13374
13375#endif
13376
13377IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13378{
13379 RTUINT128U const uSrc2 = *puSrc2;
13380 RTUINT128U const uSrc1 = *puSrc1;
13381 ASMCompilerBarrier();
13382 RTUINT128U uDstOut;
13383 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13384 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13385 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13386 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13387 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13388 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13389 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13390 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13391 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13392 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13393 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13394 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13395 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13396 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13397 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13398 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13399 *puDst = uDstOut;
13400}
13401
13402
13403IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13404{
13405 RTUINT256U const uSrc2 = *puSrc2;
13406 RTUINT256U const uSrc1 = *puSrc1;
13407 ASMCompilerBarrier();
13408 RTUINT256U uDstOut;
13409 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13410 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13411 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13412 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13413 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13414 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13415 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13416 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13417 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13418 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13419 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13420 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13421 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13422 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13423 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13424 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13425
13426 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
13427 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
13428 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
13429 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
13430 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
13431 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
13432 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
13433 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
13434 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
13435 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
13436 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
13437 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
13438 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
13439 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
13440 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
13441 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
13442 *puDst = uDstOut;
13443}
13444
13445
13446/*
13447 * PACKUSWB - signed words -> unsigned bytes
13448 */
/* Converts a 16-bit value, read as a signed word, to an unsigned byte with
   saturation: 0..0xff passes through, anything else becomes 0xff when bit 15
   (the sign) is clear and 0x00 when it is set.  The argument is evaluated
   more than once. */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    (   (uint16_t)(a_iWord) > (uint16_t)0xff \
      ? (uint8_t)0xff * (uint8_t)(1 - (((a_iWord) >> 15) & 1)) \
      : (uint8_t)(a_iWord) )
13453
13454#ifdef IEM_WITHOUT_ASSEMBLY
13455
13456IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13457{
13458 RTUINT64U const uSrc2 = { *puSrc };
13459 RTUINT64U const uSrc1 = { *puDst };
13460 ASMCompilerBarrier();
13461 RTUINT64U uDstOut;
13462 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13463 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13464 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13465 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13466 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13467 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13468 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13469 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13470 *puDst = uDstOut.u;
13471}
13472
13473
13474IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13475{
13476 RTUINT128U const uSrc2 = *puSrc;
13477 RTUINT128U const uSrc1 = *puDst;
13478 ASMCompilerBarrier();
13479 RTUINT128U uDstOut;
13480 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13481 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13482 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13483 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13484 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13485 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13486 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13487 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13488 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13489 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13490 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13491 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13492 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13493 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13494 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13495 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13496 *puDst = uDstOut;
13497}
13498
13499#endif
13500
13501IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13502{
13503 RTUINT128U const uSrc2 = *puSrc2;
13504 RTUINT128U const uSrc1 = *puSrc1;
13505 ASMCompilerBarrier();
13506 RTUINT128U uDstOut;
13507 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13508 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13509 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13510 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13511 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13512 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13513 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13514 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13515 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13516 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13517 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13518 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13519 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13520 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13521 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13522 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13523 *puDst = uDstOut;
13524}
13525
13526
13527IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13528{
13529 RTUINT256U const uSrc2 = *puSrc2;
13530 RTUINT256U const uSrc1 = *puSrc1;
13531 ASMCompilerBarrier();
13532 RTUINT256U uDstOut;
13533 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13534 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13535 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13536 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13537 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13538 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13539 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13540 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13541 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13542 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13543 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13544 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13545 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13546 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13547 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13548 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13549
13550 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
13551 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
13552 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
13553 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
13554 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
13555 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
13556 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
13557 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
13558 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
13559 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
13560 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
13561 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
13562 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
13563 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
13564 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
13565 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
13566 *puDst = uDstOut;
13567}
13568
13569
13570/*
13571 * PACKSSDW - signed dwords -> signed words
13572 */
13573
13574#ifdef IEM_WITHOUT_ASSEMBLY
13575
13576IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13577{
13578 RTUINT64U const uSrc2 = { *puSrc };
13579 RTUINT64U const uSrc1 = { *puDst };
13580 ASMCompilerBarrier();
13581 RTUINT64U uDstOut;
13582 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13583 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13584 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13585 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13586 *puDst = uDstOut.u;
13587}
13588
13589
13590IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13591{
13592 RTUINT128U const uSrc2 = *puSrc;
13593 RTUINT128U const uSrc1 = *puDst;
13594 ASMCompilerBarrier();
13595 RTUINT128U uDstOut;
13596 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13597 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13598 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13599 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13600 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13601 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13602 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13603 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13604 *puDst = uDstOut;
13605}
13606
13607#endif
13608
13609IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13610{
13611 RTUINT128U const uSrc2 = *puSrc2;
13612 RTUINT128U const uSrc1 = *puSrc1;
13613 ASMCompilerBarrier();
13614 RTUINT128U uDstOut;
13615 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13616 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13617 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13618 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13619 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13620 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13621 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13622 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13623 *puDst = uDstOut;
13624}
13625
13626
13627IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13628{
13629 RTUINT256U const uSrc2 = *puSrc2;
13630 RTUINT256U const uSrc1 = *puSrc1;
13631 ASMCompilerBarrier();
13632 RTUINT256U uDstOut;
13633 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13634 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13635 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13636 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13637 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13638 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13639 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13640 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13641
13642 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
13643 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
13644 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
13645 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
13646 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
13647 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
13648 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
13649 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
13650 *puDst = uDstOut;
13651}
13652
13653
13654/*
13655 * PACKUSDW - signed dwords -> unsigned words
13656 */
/* Converts a 32-bit value, read as a signed dword, to an unsigned word with
   saturation: 0..0xffff passes through, anything else becomes 0xffff when
   bit 31 (the sign) is clear and 0x0000 when it is set.  The argument is
   evaluated more than once. */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    (   (uint32_t)(a_iDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff * (uint16_t)(1 - (((a_iDword) >> 31) & 1)) \
      : (uint16_t)(a_iDword) )
13661
13662#ifdef IEM_WITHOUT_ASSEMBLY
13663IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13664{
13665 RTUINT128U const uSrc2 = *puSrc;
13666 RTUINT128U const uSrc1 = *puDst;
13667 ASMCompilerBarrier();
13668 RTUINT128U uDstOut;
13669 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13670 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13671 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13672 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13673 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13674 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13675 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13676 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13677 *puDst = uDstOut;
13678}
13679#endif
13680
13681IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13682{
13683 RTUINT128U const uSrc2 = *puSrc2;
13684 RTUINT128U const uSrc1 = *puSrc1;
13685 ASMCompilerBarrier();
13686 RTUINT128U uDstOut;
13687 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13688 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13689 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13690 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13691 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13692 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13693 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13694 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13695 *puDst = uDstOut;
13696}
13697
13698
13699IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13700{
13701 RTUINT256U const uSrc2 = *puSrc2;
13702 RTUINT256U const uSrc1 = *puSrc1;
13703 ASMCompilerBarrier();
13704 RTUINT256U uDstOut;
13705 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13706 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13707 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13708 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13709 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13710 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13711 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13712 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13713
13714 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
13715 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
13716 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
13717 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
13718 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
13719 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
13720 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
13721 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
13722 *puDst = uDstOut;
13723}
13724
13725
13726/*
13727 * [V]PABSB / [V]PABSW / [V]PABSD
13728 */
13729
13730IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13731{
13732 RTUINT64U const uSrc = { *puSrc };
13733 RTUINT64U uDstOut = { 0 };
13734
13735 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
13736 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
13737 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
13738 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
13739 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
13740 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
13741 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
13742 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
13743 *puDst = uDstOut.u;
13744 RT_NOREF(pFpuState);
13745}
13746
13747
13748IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13749{
13750 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13751 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13752 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13753 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13754 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13755 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13756 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13757 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13758 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13759 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13760 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13761 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13762 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13763 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13764 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13765 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13766 RT_NOREF(pFpuState);
13767}
13768
13769
13770IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13771{
13772 RTUINT64U const uSrc = { *puSrc };
13773 RTUINT64U uDstOut = { 0 };
13774
13775 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
13776 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
13777 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
13778 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
13779 *puDst = uDstOut.u;
13780 RT_NOREF(pFpuState);
13781}
13782
13783
13784IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13785{
13786 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13787 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13788 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13789 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13790 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13791 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13792 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13793 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13794 RT_NOREF(pFpuState);
13795}
13796
13797
13798IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13799{
13800 RTUINT64U const uSrc = { *puSrc };
13801 RTUINT64U uDstOut = { 0 };
13802
13803 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
13804 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
13805 *puDst = uDstOut.u;
13806 RT_NOREF(pFpuState);
13807}
13808
13809
13810IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13811{
13812 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13813 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13814 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13815 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13816 RT_NOREF(pFpuState);
13817}
13818
13819
13820IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13821{
13822 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13823 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13824 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13825 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13826 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13827 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13828 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13829 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13830 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13831 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13832 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13833 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13834 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13835 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13836 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13837 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13838}
13839
13840
13841IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13842{
13843 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13844 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13845 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13846 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13847 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13848 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13849 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13850 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13851 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13852 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13853 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13854 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13855 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13856 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13857 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13858 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13859 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
13860 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
13861 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
13862 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
13863 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
13864 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
13865 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
13866 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
13867 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
13868 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
13869 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
13870 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
13871 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
13872 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
13873 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
13874 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
13875}
13876
13877
13878IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13879{
13880 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13881 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13882 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13883 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13884 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13885 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13886 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13887 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13888}
13889
13890
13891IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13892{
13893 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13894 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13895 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13896 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13897 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13898 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13899 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13900 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13901 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
13902 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
13903 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
13904 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
13905 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
13906 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
13907 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
13908 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
13909}
13910
13911
13912IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13913{
13914 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13915 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13916 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13917 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13918}
13919
13920
13921IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13922{
13923 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13924 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13925 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13926 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13927 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
13928 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
13929 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
13930 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
13931}
13932
13933
13934/*
13935 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
13936 */
13937IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13938{
13939 RTUINT64U uSrc1 = { *puDst };
13940 RTUINT64U uSrc2 = { *puSrc };
13941 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13942
13943 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
13944 {
13945 if (uSrc2.ai8[i] < 0)
13946 uDst.ai8[i] = -uSrc1.ai8[i];
13947 else if (uSrc2.ai8[i] == 0)
13948 uDst.ai8[i] = 0;
13949 else /* uSrc2.ai8[i] > 0 */
13950 uDst.ai8[i] = uSrc1.ai8[i];
13951 }
13952
13953 *puDst = uDst.u;
13954 RT_NOREF(pFpuState);
13955}
13956
13957
13958IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13959{
13960 RTUINT128U uSrc1 = *puDst;
13961
13962 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13963 {
13964 if (puSrc->ai8[i] < 0)
13965 puDst->ai8[i] = -uSrc1.ai8[i];
13966 else if (puSrc->ai8[i] == 0)
13967 puDst->ai8[i] = 0;
13968 else /* puSrc->ai8[i] > 0 */
13969 puDst->ai8[i] = uSrc1.ai8[i];
13970 }
13971
13972 RT_NOREF(pFpuState);
13973}
13974
13975
13976IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13977{
13978 RTUINT64U uSrc1 = { *puDst };
13979 RTUINT64U uSrc2 = { *puSrc };
13980 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13981
13982 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
13983 {
13984 if (uSrc2.ai16[i] < 0)
13985 uDst.ai16[i] = -uSrc1.ai16[i];
13986 else if (uSrc2.ai16[i] == 0)
13987 uDst.ai16[i] = 0;
13988 else /* uSrc2.ai16[i] > 0 */
13989 uDst.ai16[i] = uSrc1.ai16[i];
13990 }
13991
13992 *puDst = uDst.u;
13993 RT_NOREF(pFpuState);
13994}
13995
13996
13997IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13998{
13999 RTUINT128U uSrc1 = *puDst;
14000
14001 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
14002 {
14003 if (puSrc->ai16[i] < 0)
14004 puDst->ai16[i] = -uSrc1.ai16[i];
14005 else if (puSrc->ai16[i] == 0)
14006 puDst->ai16[i] = 0;
14007 else /* puSrc->ai16[i] > 0 */
14008 puDst->ai16[i] = uSrc1.ai16[i];
14009 }
14010
14011 RT_NOREF(pFpuState);
14012}
14013
14014
14015IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14016{
14017 RTUINT64U uSrc1 = { *puDst };
14018 RTUINT64U uSrc2 = { *puSrc };
14019 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14020
14021 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
14022 {
14023 if (uSrc2.ai32[i] < 0)
14024 uDst.ai32[i] = -uSrc1.ai32[i];
14025 else if (uSrc2.ai32[i] == 0)
14026 uDst.ai32[i] = 0;
14027 else /* uSrc2.ai32[i] > 0 */
14028 uDst.ai32[i] = uSrc1.ai32[i];
14029 }
14030
14031 *puDst = uDst.u;
14032 RT_NOREF(pFpuState);
14033}
14034
14035
14036IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14037{
14038 RTUINT128U uSrc1 = *puDst;
14039
14040 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
14041 {
14042 if (puSrc->ai32[i] < 0)
14043 puDst->ai32[i] = -uSrc1.ai32[i];
14044 else if (puSrc->ai32[i] == 0)
14045 puDst->ai32[i] = 0;
14046 else /* puSrc->ai32[i] > 0 */
14047 puDst->ai32[i] = uSrc1.ai32[i];
14048 }
14049
14050 RT_NOREF(pFpuState);
14051}
14052
14053
14054IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14055{
14056 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
14057 {
14058 if (puSrc2->ai8[i] < 0)
14059 puDst->ai8[i] = -puSrc1->ai8[i];
14060 else if (puSrc2->ai8[i] == 0)
14061 puDst->ai8[i] = 0;
14062 else /* puSrc2->ai8[i] > 0 */
14063 puDst->ai8[i] = puSrc1->ai8[i];
14064 }
14065}
14066
14067
14068IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14069{
14070 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
14071 {
14072 if (puSrc2->ai8[i] < 0)
14073 puDst->ai8[i] = -puSrc1->ai8[i];
14074 else if (puSrc2->ai8[i] == 0)
14075 puDst->ai8[i] = 0;
14076 else /* puSrc2->ai8[i] > 0 */
14077 puDst->ai8[i] = puSrc1->ai8[i];
14078 }
14079}
14080
14081
14082IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14083{
14084 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
14085 {
14086 if (puSrc2->ai16[i] < 0)
14087 puDst->ai16[i] = -puSrc1->ai16[i];
14088 else if (puSrc2->ai16[i] == 0)
14089 puDst->ai16[i] = 0;
14090 else /* puSrc2->ai16[i] > 0 */
14091 puDst->ai16[i] = puSrc1->ai16[i];
14092 }
14093}
14094
14095
14096IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14097{
14098 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
14099 {
14100 if (puSrc2->ai16[i] < 0)
14101 puDst->ai16[i] = -puSrc1->ai16[i];
14102 else if (puSrc2->ai16[i] == 0)
14103 puDst->ai16[i] = 0;
14104 else /* puSrc2->ai16[i] > 0 */
14105 puDst->ai16[i] = puSrc1->ai16[i];
14106 }
14107}
14108
14109
14110IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14111{
14112 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
14113 {
14114 if (puSrc2->ai32[i] < 0)
14115 puDst->ai32[i] = -puSrc1->ai32[i];
14116 else if (puSrc2->ai32[i] == 0)
14117 puDst->ai32[i] = 0;
14118 else /* puSrc2->ai32[i] > 0 */
14119 puDst->ai32[i] = puSrc1->ai32[i];
14120 }
14121}
14122
14123
14124IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14125{
14126 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
14127 {
14128 if (puSrc2->ai32[i] < 0)
14129 puDst->ai32[i] = -puSrc1->ai32[i];
14130 else if (puSrc2->ai32[i] == 0)
14131 puDst->ai32[i] = 0;
14132 else /* puSrc2->ai32[i] > 0 */
14133 puDst->ai32[i] = puSrc1->ai32[i];
14134 }
14135}
14136
14137
14138/*
14139 * PHADDW / VPHADDW / PHADDD / VPHADDD
14140 */
14141IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14142{
14143 RTUINT64U uSrc1 = { *puDst };
14144 RTUINT64U uSrc2 = { *puSrc };
14145 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14146
14147 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
14148 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
14149 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
14150 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
14151 *puDst = uDst.u;
14152 RT_NOREF(pFpuState);
14153}
14154
14155
14156IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14157{
14158 RTUINT128U uSrc1 = *puDst;
14159
14160 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
14161 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
14162 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
14163 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
14164
14165 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
14166 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
14167 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
14168 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
14169 RT_NOREF(pFpuState);
14170}
14171
14172
14173IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14174{
14175 RTUINT64U uSrc1 = { *puDst };
14176 RTUINT64U uSrc2 = { *puSrc };
14177 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14178
14179 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14180 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
14181 *puDst = uDst.u;
14182 RT_NOREF(pFpuState);
14183}
14184
14185
14186IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14187{
14188 RTUINT128U uSrc1 = *puDst;
14189
14190 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14191 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
14192
14193 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
14194 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
14195 RT_NOREF(pFpuState);
14196}
14197
14198
14199IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14200{
14201 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14202
14203 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
14204 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
14205 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
14206 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
14207
14208 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
14209 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
14210 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
14211 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
14212
14213 puDst->au64[0] = uDst.au64[0];
14214 puDst->au64[1] = uDst.au64[1];
14215}
14216
14217
14218IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14219{
14220 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14221
14222 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
14223 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
14224 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
14225 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
14226 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
14227 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
14228 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
14229 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
14230
14231 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
14232 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
14233 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
14234 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
14235 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
14236 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
14237 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
14238 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
14239
14240 puDst->au64[0] = uDst.au64[0];
14241 puDst->au64[1] = uDst.au64[1];
14242 puDst->au64[2] = uDst.au64[2];
14243 puDst->au64[3] = uDst.au64[3];
14244}
14245
14246
14247IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14248{
14249 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14250
14251 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
14252 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
14253
14254 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
14255 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
14256
14257 puDst->au64[0] = uDst.au64[0];
14258 puDst->au64[1] = uDst.au64[1];
14259}
14260
14261
14262IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14263{
14264 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14265
14266 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
14267 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
14268 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
14269 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
14270
14271 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
14272 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
14273 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
14274 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
14275
14276 puDst->au64[0] = uDst.au64[0];
14277 puDst->au64[1] = uDst.au64[1];
14278 puDst->au64[2] = uDst.au64[2];
14279 puDst->au64[3] = uDst.au64[3];
14280}
14281
14282
14283/*
14284 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
14285 */
14286IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14287{
14288 RTUINT64U uSrc1 = { *puDst };
14289 RTUINT64U uSrc2 = { *puSrc };
14290 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14291
14292 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14293 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14294 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
14295 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
14296 *puDst = uDst.u;
14297 RT_NOREF(pFpuState);
14298}
14299
14300
14301IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14302{
14303 RTUINT128U uSrc1 = *puDst;
14304
14305 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14306 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14307 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
14308 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
14309
14310 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
14311 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
14312 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
14313 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
14314 RT_NOREF(pFpuState);
14315}
14316
14317
14318IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14319{
14320 RTUINT64U uSrc1 = { *puDst };
14321 RTUINT64U uSrc2 = { *puSrc };
14322 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14323
14324 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14325 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
14326 *puDst = uDst.u;
14327 RT_NOREF(pFpuState);
14328}
14329
14330
14331IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14332{
14333 RTUINT128U uSrc1 = *puDst;
14334
14335 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14336 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
14337
14338 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
14339 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
14340 RT_NOREF(pFpuState);
14341}
14342
14343
14344IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14345{
14346 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14347
14348 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
14349 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
14350 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
14351 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
14352
14353 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
14354 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
14355 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
14356 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
14357
14358 puDst->au64[0] = uDst.au64[0];
14359 puDst->au64[1] = uDst.au64[1];
14360}
14361
14362
14363IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14364{
14365 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14366
14367 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
14368 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
14369 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
14370 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
14371 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
14372 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
14373 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
14374 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
14375
14376 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
14377 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
14378 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
14379 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
14380 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
14381 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
14382 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
14383 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
14384
14385 puDst->au64[0] = uDst.au64[0];
14386 puDst->au64[1] = uDst.au64[1];
14387 puDst->au64[2] = uDst.au64[2];
14388 puDst->au64[3] = uDst.au64[3];
14389}
14390
14391
14392IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14393{
14394 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14395
14396 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
14397 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
14398
14399 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
14400 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
14401
14402 puDst->au64[0] = uDst.au64[0];
14403 puDst->au64[1] = uDst.au64[1];
14404}
14405
14406
14407IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14408{
14409 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14410
14411 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
14412 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
14413 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
14414 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
14415
14416 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
14417 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
14418 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
14419 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
14420
14421 puDst->au64[0] = uDst.au64[0];
14422 puDst->au64[1] = uDst.au64[1];
14423 puDst->au64[2] = uDst.au64[2];
14424 puDst->au64[3] = uDst.au64[3];
14425}
14426
14427
14428/*
14429 * PHADDSW / VPHADDSW
14430 */
14431IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14432{
14433 RTUINT64U uSrc1 = { *puDst };
14434 RTUINT64U uSrc2 = { *puSrc };
14435 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14436
14437 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14438 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14439 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
14440 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
14441 *puDst = uDst.u;
14442 RT_NOREF(pFpuState);
14443}
14444
14445
14446IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14447{
14448 RTUINT128U uSrc1 = *puDst;
14449
14450 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14451 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14452 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
14453 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
14454
14455 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
14456 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
14457 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
14458 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
14459 RT_NOREF(pFpuState);
14460}
14461
14462
14463IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14464{
14465 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14466
14467 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
14468 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
14469 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
14470 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
14471
14472 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
14473 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
14474 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
14475 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
14476
14477 puDst->au64[0] = uDst.au64[0];
14478 puDst->au64[1] = uDst.au64[1];
14479}
14480
14481
14482IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14483{
14484 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14485
14486 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
14487 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
14488 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
14489 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
14490 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
14491 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
14492 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
14493 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
14494
14495 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
14496 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
14497 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
14498 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
14499 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
14500 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
14501 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
14502 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
14503
14504 puDst->au64[0] = uDst.au64[0];
14505 puDst->au64[1] = uDst.au64[1];
14506 puDst->au64[2] = uDst.au64[2];
14507 puDst->au64[3] = uDst.au64[3];
14508}
14509
14510
14511/*
14512 * PHSUBSW / VPHSUBSW
14513 */
14514IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14515{
14516 RTUINT64U uSrc1 = { *puDst };
14517 RTUINT64U uSrc2 = { *puSrc };
14518 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14519
14520 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14521 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14522 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
14523 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
14524 *puDst = uDst.u;
14525 RT_NOREF(pFpuState);
14526}
14527
14528
14529IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14530{
14531 RTUINT128U uSrc1 = *puDst;
14532
14533 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14534 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14535 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
14536 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
14537
14538 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
14539 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
14540 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
14541 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
14542 RT_NOREF(pFpuState);
14543}
14544
14545
14546IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14547{
14548 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14549
14550 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
14551 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
14552 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
14553 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
14554
14555 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
14556 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
14557 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
14558 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
14559
14560 puDst->au64[0] = uDst.au64[0];
14561 puDst->au64[1] = uDst.au64[1];
14562}
14563
14564
14565IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14566{
14567 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14568
14569 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
14570 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
14571 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
14572 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
14573 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
14574 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
14575 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
14576 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
14577
14578 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
14579 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
14580 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
14581 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
14582 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
14583 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
14584 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
14585 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
14586
14587 puDst->au64[0] = uDst.au64[0];
14588 puDst->au64[1] = uDst.au64[1];
14589 puDst->au64[2] = uDst.au64[2];
14590 puDst->au64[3] = uDst.au64[3];
14591}
14592
14593
14594/*
14595 * PMADDUBSW / VPMADDUBSW
14596 */
14597IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14598{
14599 RTUINT64U uSrc1 = { *puDst };
14600 RTUINT64U uSrc2 = { *puSrc };
14601 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14602
14603 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
14604 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
14605 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
14606 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
14607 *puDst = uDst.u;
14608 RT_NOREF(pFpuState);
14609}
14610
14611
14612IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14613{
14614 RTUINT128U uSrc1 = *puDst;
14615
14616 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
14617 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
14618 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
14619 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
14620 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
14621 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
14622 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
14623 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
14624 RT_NOREF(pFpuState);
14625}
14626
14627
14628IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14629{
14630 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14631
14632 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14633 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14634 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14635 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14636 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14637 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14638 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14639 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14640
14641 puDst->au64[0] = uDst.au64[0];
14642 puDst->au64[1] = uDst.au64[1];
14643}
14644
14645
14646IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14647{
14648 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14649
14650 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14651 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14652 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14653 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14654 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14655 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14656 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14657 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14658 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
14659 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
14660 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
14661 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
14662 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
14663 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
14664 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
14665 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
14666
14667 puDst->au64[0] = uDst.au64[0];
14668 puDst->au64[1] = uDst.au64[1];
14669 puDst->au64[2] = uDst.au64[2];
14670 puDst->au64[3] = uDst.au64[3];
14671}
14672
14673
14674/*
14675 * PMULHRSW / VPMULHRSW
14676 */
/** Multiplies the two operands as a 32-bit signed product, shifts it right by
 *  14, adds one (the rounding step), shifts right once more and truncates the
 *  result to 16 bits.  Both arguments are evaluated exactly once. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    (uint16_t)(((((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1) >> 1)
14679
14680IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14681{
14682 RTUINT64U uSrc1 = { *puDst };
14683 RTUINT64U uSrc2 = { *puSrc };
14684 RTUINT64U uDst;
14685
14686 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
14687 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
14688 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
14689 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
14690 *puDst = uDst.u;
14691 RT_NOREF(pFpuState);
14692}
14693
14694
14695IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14696{
14697 RTUINT128U uSrc1 = *puDst;
14698
14699 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
14700 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
14701 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
14702 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
14703 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
14704 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
14705 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
14706 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
14707 RT_NOREF(pFpuState);
14708}
14709
14710
14711IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14712{
14713 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14714
14715 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
14716 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
14717 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
14718 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
14719 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
14720 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
14721 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
14722 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
14723
14724 puDst->au64[0] = uDst.au64[0];
14725 puDst->au64[1] = uDst.au64[1];
14726}
14727
14728
14729IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14730{
14731 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14732
14733 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
14734 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
14735 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
14736 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
14737 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
14738 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
14739 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
14740 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
14741 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
14742 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
14743 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
14744 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
14745 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
14746 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
14747 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
14748 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
14749
14750 puDst->au64[0] = uDst.au64[0];
14751 puDst->au64[1] = uDst.au64[1];
14752 puDst->au64[2] = uDst.au64[2];
14753 puDst->au64[3] = uDst.au64[3];
14754}
14755
14756
14757/*
14758 * PSADBW / VPSADBW
14759 */
14760#ifdef IEM_WITHOUT_ASSEMBLY
14761
14762IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
14763{
14764 RTUINT64U uSrc1 = { *puDst };
14765 RTUINT64U uSrc2 = { *puSrc };
14766 RTUINT64U uDst;
14767 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14768 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14769 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14770 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14771 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14772 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14773 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14774 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14775
14776 uDst.au64[0] = 0;
14777 uDst.au16[0] = uSum;
14778 *puDst = uDst.u;
14779}
14780
14781
14782IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14783{
14784 RTUINT128U uSrc1 = *puDst;
14785
14786 puDst->au64[0] = 0;
14787 puDst->au64[1] = 0;
14788
14789 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
14790 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
14791 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
14792 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
14793 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
14794 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
14795 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
14796 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
14797 puDst->au16[0] = uSum;
14798
14799 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
14800 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
14801 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
14802 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
14803 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
14804 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
14805 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
14806 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
14807 puDst->au16[4] = uSum;
14808}
14809
14810#endif
14811
14812IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14813{
14814 RTUINT128U uSrc1 = *puSrc1;
14815 RTUINT128U uSrc2 = *puSrc2;
14816
14817 puDst->au64[0] = 0;
14818 puDst->au64[1] = 0;
14819
14820 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
14821 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14822 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14823 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14824 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14825 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14826 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14827 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14828 puDst->au16[0] = uSum;
14829
14830 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14831 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14832 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14833 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14834 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14835 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14836 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14837 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14838 puDst->au16[4] = uSum;
14839}
14840
14841IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14842{
14843 RTUINT256U uSrc1 = *puSrc1;
14844 RTUINT256U uSrc2 = *puSrc2;
14845
14846 puDst->au64[0] = 0;
14847 puDst->au64[1] = 0;
14848 puDst->au64[2] = 0;
14849 puDst->au64[3] = 0;
14850
14851 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14852 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14853 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14854 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14855 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14856 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14857 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14858 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14859 puDst->au16[0] = uSum;
14860
14861 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14862 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14863 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14864 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14865 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14866 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14867 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14868 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14869 puDst->au16[4] = uSum;
14870
14871 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
14872 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
14873 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
14874 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
14875 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
14876 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
14877 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
14878 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
14879 puDst->au16[8] = uSum;
14880
14881 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
14882 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
14883 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
14884 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
14885 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
14886 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
14887 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
14888 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
14889 puDst->au16[12] = uSum;
14890}
14891
14892
14893/*
14894 * PMULDQ / VPMULDQ
14895 */
14896IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14897{
14898 RTUINT128U uSrc1 = *puDst;
14899
14900 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
14901 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
14902}
14903
14904IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14905{
14906 RTUINT128U uSrc1 = *puSrc1;
14907 RTUINT128U uSrc2 = *puSrc2;
14908
14909 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14910 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14911}
14912
14913IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14914{
14915 RTUINT256U uSrc1 = *puSrc1;
14916 RTUINT256U uSrc2 = *puSrc2;
14917
14918 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14919 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14920 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
14921 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
14922}
14923
14924
14925/*
14926 * PMULUDQ / VPMULUDQ
14927 */
14928#ifdef IEM_WITHOUT_ASSEMBLY
14929
14930IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14931{
14932 RTUINT64U uSrc1 = { *puDst };
14933 RTUINT64U uSrc2 = { *puSrc };
14934 ASMCompilerBarrier();
14935 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14936 RT_NOREF(pFpuState);
14937}
14938
14939
14940IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14941{
14942 RTUINT128U uSrc1 = *puDst;
14943 RTUINT128U uSrc2 = *puSrc;
14944 ASMCompilerBarrier();
14945 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14946 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14947 RT_NOREF(pFpuState);
14948}
14949
14950#endif
14951
14952IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14953{
14954 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14955 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14956 ASMCompilerBarrier();
14957 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14958 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14959}
14960
14961
14962IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14963{
14964 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14965 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14966 ASMCompilerBarrier();
14967 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14968 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14969 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
14970 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
14971}
14972
14973
14974/*
14975 * UNPCKLPS / VUNPCKLPS
14976 */
14977#ifdef IEM_WITHOUT_ASSEMBLY
14978IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14979{
14980 RTUINT128U uSrc1 = *puDst;
14981 RTUINT128U uSrc2 = *puSrc;
14982 ASMCompilerBarrier();
14983 puDst->au32[0] = uSrc1.au32[0];
14984 puDst->au32[1] = uSrc2.au32[0];
14985 puDst->au32[2] = uSrc1.au32[1];
14986 puDst->au32[3] = uSrc2.au32[1];
14987}
14988
14989#endif
14990
14991IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14992{
14993 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14994 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14995 ASMCompilerBarrier();
14996 puDst->au32[0] = uSrc1.au32[0];
14997 puDst->au32[1] = uSrc2.au32[0];
14998 puDst->au32[2] = uSrc1.au32[1];
14999 puDst->au32[3] = uSrc2.au32[1];
15000}
15001
15002
15003IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
15004{
15005 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
15006 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
15007 ASMCompilerBarrier();
15008 puDst->au32[0] = uSrc1.au32[0];
15009 puDst->au32[1] = uSrc2.au32[0];
15010 puDst->au32[2] = uSrc1.au32[1];
15011 puDst->au32[3] = uSrc2.au32[1];
15012
15013 puDst->au32[4] = uSrc1.au32[4];
15014 puDst->au32[5] = uSrc2.au32[4];
15015 puDst->au32[6] = uSrc1.au32[5];
15016 puDst->au32[7] = uSrc2.au32[5];
15017}
15018
15019
15020/*
15021 * UNPCKLPD / VUNPCKLPD
15022 */
15023#ifdef IEM_WITHOUT_ASSEMBLY
15024IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15025{
15026 RTUINT128U uSrc1 = *puDst;
15027 RTUINT128U uSrc2 = *puSrc;
15028 ASMCompilerBarrier();
15029 puDst->au64[0] = uSrc1.au64[0];
15030 puDst->au64[1] = uSrc2.au64[0];
15031}
15032
15033#endif
15034
15035IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
15036{
15037 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
15038 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
15039 ASMCompilerBarrier();
15040 puDst->au64[0] = uSrc1.au64[0];
15041 puDst->au64[1] = uSrc2.au64[0];
15042}
15043
15044
15045IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
15046{
15047 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
15048 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
15049 ASMCompilerBarrier();
15050 puDst->au64[0] = uSrc1.au64[0];
15051 puDst->au64[1] = uSrc2.au64[0];
15052 puDst->au64[2] = uSrc1.au64[2];
15053 puDst->au64[3] = uSrc2.au64[2];
15054}
15055
15056
15057/*
15058 * UNPCKHPS / VUNPCKHPS
15059 */
15060#ifdef IEM_WITHOUT_ASSEMBLY
15061IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15062{
15063 RTUINT128U uSrc1 = *puDst;
15064 RTUINT128U uSrc2 = *puSrc;
15065 ASMCompilerBarrier();
15066 puDst->au32[0] = uSrc1.au32[2];
15067 puDst->au32[1] = uSrc2.au32[2];
15068 puDst->au32[2] = uSrc1.au32[3];
15069 puDst->au32[3] = uSrc2.au32[3];
15070}
15071
15072#endif
15073
15074IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
15075{
15076 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
15077 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
15078 ASMCompilerBarrier();
15079 puDst->au32[0] = uSrc1.au32[2];
15080 puDst->au32[1] = uSrc2.au32[2];
15081 puDst->au32[2] = uSrc1.au32[3];
15082 puDst->au32[3] = uSrc2.au32[3];
15083}
15084
15085
15086IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
15087{
15088 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
15089 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
15090 ASMCompilerBarrier();
15091 puDst->au32[0] = uSrc1.au32[2];
15092 puDst->au32[1] = uSrc2.au32[2];
15093 puDst->au32[2] = uSrc1.au32[3];
15094 puDst->au32[3] = uSrc2.au32[3];
15095
15096 puDst->au32[4] = uSrc1.au32[6];
15097 puDst->au32[5] = uSrc2.au32[6];
15098 puDst->au32[6] = uSrc1.au32[7];
15099 puDst->au32[7] = uSrc2.au32[7];
15100}
15101
15102
15103/*
15104 * UNPCKHPD / VUNPCKHPD
15105 */
15106#ifdef IEM_WITHOUT_ASSEMBLY
15107IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15108{
15109 RTUINT128U uSrc1 = *puDst;
15110 RTUINT128U uSrc2 = *puSrc;
15111 ASMCompilerBarrier();
15112 puDst->au64[0] = uSrc1.au64[1];
15113 puDst->au64[1] = uSrc2.au64[1];
15114}
15115
15116#endif
15117
15118IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
15119{
15120 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
15121 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
15122 ASMCompilerBarrier();
15123 puDst->au64[0] = uSrc1.au64[1];
15124 puDst->au64[1] = uSrc2.au64[1];
15125}
15126
15127
15128IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
15129{
15130 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
15131 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
15132 ASMCompilerBarrier();
15133 puDst->au64[0] = uSrc1.au64[1];
15134 puDst->au64[1] = uSrc2.au64[1];
15135 puDst->au64[2] = uSrc1.au64[3];
15136 puDst->au64[3] = uSrc2.au64[3];
15137}
15138
15139
15140/*
 * CRC32 (SSE 4.2).
15142 */
15143
15144IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
15145{
15146 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15147}
15148
15149
15150IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
15151{
15152 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15153}
15154
15155IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
15156{
15157 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15158}
15159
15160IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
15161{
15162 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15163}
15164
15165
15166/*
 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
15168 */
15169#ifdef IEM_WITHOUT_ASSEMBLY
15170IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
15171{
15172 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15173 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
15174 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
15175 fEfl |= X86_EFL_ZF;
15176 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15177 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
15178 fEfl |= X86_EFL_CF;
15179 *pfEFlags = fEfl;
15180}
15181#endif
15182
15183IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
15184{
15185 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15186 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
15187 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
15188 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
15189 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15190 fEfl |= X86_EFL_ZF;
15191 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15192 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
15193 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
15194 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15195 fEfl |= X86_EFL_CF;
15196 *pfEFlags = fEfl;
15197}
15198
15199
15200/*
15201 * PMOVSXBW / VPMOVSXBW
15202 */
15203IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15204{
15205 RTUINT64U uSrc1 = { uSrc };
15206 puDst->ai16[0] = uSrc1.ai8[0];
15207 puDst->ai16[1] = uSrc1.ai8[1];
15208 puDst->ai16[2] = uSrc1.ai8[2];
15209 puDst->ai16[3] = uSrc1.ai8[3];
15210 puDst->ai16[4] = uSrc1.ai8[4];
15211 puDst->ai16[5] = uSrc1.ai8[5];
15212 puDst->ai16[6] = uSrc1.ai8[6];
15213 puDst->ai16[7] = uSrc1.ai8[7];
15214}
15215
15216
15217IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15218{
15219 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15220 puDst->ai16[ 0] = uSrc1.ai8[ 0];
15221 puDst->ai16[ 1] = uSrc1.ai8[ 1];
15222 puDst->ai16[ 2] = uSrc1.ai8[ 2];
15223 puDst->ai16[ 3] = uSrc1.ai8[ 3];
15224 puDst->ai16[ 4] = uSrc1.ai8[ 4];
15225 puDst->ai16[ 5] = uSrc1.ai8[ 5];
15226 puDst->ai16[ 6] = uSrc1.ai8[ 6];
15227 puDst->ai16[ 7] = uSrc1.ai8[ 7];
15228 puDst->ai16[ 8] = uSrc1.ai8[ 8];
15229 puDst->ai16[ 9] = uSrc1.ai8[ 9];
15230 puDst->ai16[10] = uSrc1.ai8[10];
15231 puDst->ai16[11] = uSrc1.ai8[11];
15232 puDst->ai16[12] = uSrc1.ai8[12];
15233 puDst->ai16[13] = uSrc1.ai8[13];
15234 puDst->ai16[14] = uSrc1.ai8[14];
15235 puDst->ai16[15] = uSrc1.ai8[15];
15236}
15237
15238
15239/*
15240 * PMOVSXBD / VPMOVSXBD
15241 */
15242IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15243{
15244 RTUINT32U uSrc1 = { uSrc };
15245 puDst->ai32[0] = uSrc1.ai8[0];
15246 puDst->ai32[1] = uSrc1.ai8[1];
15247 puDst->ai32[2] = uSrc1.ai8[2];
15248 puDst->ai32[3] = uSrc1.ai8[3];
15249}
15250
15251
15252IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15253{
15254 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15255 puDst->ai32[0] = uSrc1.ai8[0];
15256 puDst->ai32[1] = uSrc1.ai8[1];
15257 puDst->ai32[2] = uSrc1.ai8[2];
15258 puDst->ai32[3] = uSrc1.ai8[3];
15259 puDst->ai32[4] = uSrc1.ai8[4];
15260 puDst->ai32[5] = uSrc1.ai8[5];
15261 puDst->ai32[6] = uSrc1.ai8[6];
15262 puDst->ai32[7] = uSrc1.ai8[7];
15263}
15264
15265
15266/*
15267 * PMOVSXBQ / VPMOVSXBQ
15268 */
15269IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15270{
15271 RTUINT16U uSrc1 = { uSrc };
15272 puDst->ai64[0] = uSrc1.ai8[0];
15273 puDst->ai64[1] = uSrc1.ai8[1];
15274}
15275
15276
15277IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15278{
15279 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15280 puDst->ai64[0] = uSrc1.ai8[0];
15281 puDst->ai64[1] = uSrc1.ai8[1];
15282 puDst->ai64[2] = uSrc1.ai8[2];
15283 puDst->ai64[3] = uSrc1.ai8[3];
15284}
15285
15286
15287/*
15288 * PMOVSXWD / VPMOVSXWD
15289 */
15290IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15291{
15292 RTUINT64U uSrc1 = { uSrc };
15293 puDst->ai32[0] = uSrc1.ai16[0];
15294 puDst->ai32[1] = uSrc1.ai16[1];
15295 puDst->ai32[2] = uSrc1.ai16[2];
15296 puDst->ai32[3] = uSrc1.ai16[3];
15297}
15298
15299
15300IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15301{
15302 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15303 puDst->ai32[0] = uSrc1.ai16[0];
15304 puDst->ai32[1] = uSrc1.ai16[1];
15305 puDst->ai32[2] = uSrc1.ai16[2];
15306 puDst->ai32[3] = uSrc1.ai16[3];
15307 puDst->ai32[4] = uSrc1.ai16[4];
15308 puDst->ai32[5] = uSrc1.ai16[5];
15309 puDst->ai32[6] = uSrc1.ai16[6];
15310 puDst->ai32[7] = uSrc1.ai16[7];
15311}
15312
15313
15314/*
15315 * PMOVSXWQ / VPMOVSXWQ
15316 */
15317IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15318{
15319 RTUINT32U uSrc1 = { uSrc };
15320 puDst->ai64[0] = uSrc1.ai16[0];
15321 puDst->ai64[1] = uSrc1.ai16[1];
15322}
15323
15324
15325IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15326{
15327 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15328 puDst->ai64[0] = uSrc1.ai16[0];
15329 puDst->ai64[1] = uSrc1.ai16[1];
15330 puDst->ai64[2] = uSrc1.ai16[2];
15331 puDst->ai64[3] = uSrc1.ai16[3];
15332}
15333
15334
15335/*
15336 * PMOVSXDQ / VPMOVSXDQ
15337 */
15338IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15339{
15340 RTUINT64U uSrc1 = { uSrc };
15341 puDst->ai64[0] = uSrc1.ai32[0];
15342 puDst->ai64[1] = uSrc1.ai32[1];
15343}
15344
15345
15346IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15347{
15348 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15349 puDst->ai64[0] = uSrc1.ai32[0];
15350 puDst->ai64[1] = uSrc1.ai32[1];
15351 puDst->ai64[2] = uSrc1.ai32[2];
15352 puDst->ai64[3] = uSrc1.ai32[3];
15353}
15354
15355
15356/*
15357 * PMOVZXBW / VPMOVZXBW
15358 */
15359IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15360{
15361 RTUINT64U uSrc1 = { uSrc };
15362 puDst->au16[0] = uSrc1.au8[0];
15363 puDst->au16[1] = uSrc1.au8[1];
15364 puDst->au16[2] = uSrc1.au8[2];
15365 puDst->au16[3] = uSrc1.au8[3];
15366 puDst->au16[4] = uSrc1.au8[4];
15367 puDst->au16[5] = uSrc1.au8[5];
15368 puDst->au16[6] = uSrc1.au8[6];
15369 puDst->au16[7] = uSrc1.au8[7];
15370}
15371
15372
15373IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15374{
15375 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15376 puDst->au16[ 0] = uSrc1.au8[ 0];
15377 puDst->au16[ 1] = uSrc1.au8[ 1];
15378 puDst->au16[ 2] = uSrc1.au8[ 2];
15379 puDst->au16[ 3] = uSrc1.au8[ 3];
15380 puDst->au16[ 4] = uSrc1.au8[ 4];
15381 puDst->au16[ 5] = uSrc1.au8[ 5];
15382 puDst->au16[ 6] = uSrc1.au8[ 6];
15383 puDst->au16[ 7] = uSrc1.au8[ 7];
15384 puDst->au16[ 8] = uSrc1.au8[ 8];
15385 puDst->au16[ 9] = uSrc1.au8[ 9];
15386 puDst->au16[10] = uSrc1.au8[10];
15387 puDst->au16[11] = uSrc1.au8[11];
15388 puDst->au16[12] = uSrc1.au8[12];
15389 puDst->au16[13] = uSrc1.au8[13];
15390 puDst->au16[14] = uSrc1.au8[14];
15391 puDst->au16[15] = uSrc1.au8[15];
15392}
15393
15394
15395/*
15396 * PMOVZXBD / VPMOVZXBD
15397 */
15398IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15399{
15400 RTUINT32U uSrc1 = { uSrc };
15401 puDst->au32[0] = uSrc1.au8[0];
15402 puDst->au32[1] = uSrc1.au8[1];
15403 puDst->au32[2] = uSrc1.au8[2];
15404 puDst->au32[3] = uSrc1.au8[3];
15405}
15406
15407
15408IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15409{
15410 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15411 puDst->au32[0] = uSrc1.au8[0];
15412 puDst->au32[1] = uSrc1.au8[1];
15413 puDst->au32[2] = uSrc1.au8[2];
15414 puDst->au32[3] = uSrc1.au8[3];
15415 puDst->au32[4] = uSrc1.au8[4];
15416 puDst->au32[5] = uSrc1.au8[5];
15417 puDst->au32[6] = uSrc1.au8[6];
15418 puDst->au32[7] = uSrc1.au8[7];
15419}
15420
15421
15422/*
15423 * PMOVZXBQ / VPMOVZXBQ
15424 */
15425IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15426{
15427 RTUINT16U uSrc1 = { uSrc };
15428 puDst->au64[0] = uSrc1.au8[0];
15429 puDst->au64[1] = uSrc1.au8[1];
15430}
15431
15432
15433IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15434{
15435 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15436 puDst->au64[0] = uSrc1.au8[0];
15437 puDst->au64[1] = uSrc1.au8[1];
15438 puDst->au64[2] = uSrc1.au8[2];
15439 puDst->au64[3] = uSrc1.au8[3];
15440}
15441
15442
15443/*
15444 * PMOVZXWD / VPMOVZXWD
15445 */
15446IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15447{
15448 RTUINT64U uSrc1 = { uSrc };
15449 puDst->au32[0] = uSrc1.au16[0];
15450 puDst->au32[1] = uSrc1.au16[1];
15451 puDst->au32[2] = uSrc1.au16[2];
15452 puDst->au32[3] = uSrc1.au16[3];
15453}
15454
15455
15456IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15457{
15458 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15459 puDst->au32[0] = uSrc1.au16[0];
15460 puDst->au32[1] = uSrc1.au16[1];
15461 puDst->au32[2] = uSrc1.au16[2];
15462 puDst->au32[3] = uSrc1.au16[3];
15463 puDst->au32[4] = uSrc1.au16[4];
15464 puDst->au32[5] = uSrc1.au16[5];
15465 puDst->au32[6] = uSrc1.au16[6];
15466 puDst->au32[7] = uSrc1.au16[7];
15467}
15468
15469
15470/*
15471 * PMOVZXWQ / VPMOVZXWQ
15472 */
15473IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15474{
15475 RTUINT32U uSrc1 = { uSrc };
15476 puDst->au64[0] = uSrc1.au16[0];
15477 puDst->au64[1] = uSrc1.au16[1];
15478}
15479
15480
15481IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15482{
15483 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15484 puDst->au64[0] = uSrc1.au16[0];
15485 puDst->au64[1] = uSrc1.au16[1];
15486 puDst->au64[2] = uSrc1.au16[2];
15487 puDst->au64[3] = uSrc1.au16[3];
15488}
15489
15490
15491/*
15492 * PMOVZXDQ / VPMOVZXDQ
15493 */
15494IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15495{
15496 RTUINT64U uSrc1 = { uSrc };
15497 puDst->au64[0] = uSrc1.au32[0];
15498 puDst->au64[1] = uSrc1.au32[1];
15499}
15500
15501
15502IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15503{
15504 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15505 puDst->au64[0] = uSrc1.au32[0];
15506 puDst->au64[1] = uSrc1.au32[1];
15507 puDst->au64[2] = uSrc1.au32[2];
15508 puDst->au64[3] = uSrc1.au32[3];
15509}
15510
15511/**
15512 * Converts from the packed IPRT 32-bit (single precision) floating point format to
15513 * the SoftFloat 32-bit floating point format (float32_t).
15514 *
15515 * This is only a structure format conversion, nothing else.
15516 */
15517DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
15518{
15519 float32_t Tmp;
15520 Tmp.v = pr32Val->u;
15521 return Tmp;
15522}
15523
15524
15525/**
15526 * Converts from SoftFloat 32-bit floating point format (float32_t)
15527 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
15528 *
15529 * This is only a structure format conversion, nothing else.
15530 */
15531DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
15532{
15533 pr32Dst->u = r32XSrc.v;
15534 return pr32Dst;
15535}
15536
15537
15538/**
15539 * Converts from the packed IPRT 64-bit (single precision) floating point format to
15540 * the SoftFloat 64-bit floating point format (float64_t).
15541 *
15542 * This is only a structure format conversion, nothing else.
15543 */
15544DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
15545{
15546 float64_t Tmp;
15547 Tmp.v = pr64Val->u;
15548 return Tmp;
15549}
15550
15551
15552/**
15553 * Converts from SoftFloat 64-bit floating point format (float64_t)
15554 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
15555 *
15556 * This is only a structure format conversion, nothing else.
15557 */
15558DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
15559{
15560 pr64Dst->u = r64XSrc.v;
15561 return pr64Dst;
15562}
15563
15564
/**
 * Brace initializer for a SoftFloat state structure, derived from an MXCSR value.
 *
 * The rounding mode is picked from the MXCSR.RC field and the exception mask
 * bits are taken from the MXCSR mask field shifted down to bit 0.
 *
 * @param   a_Mxcsr     The MXCSR value.  Expanded several times, so it must
 *                      not have side effects.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        /* Tininess detection. */ \
        softfloat_tininess_afterRounding, \
        /* Rounding mode according to MXCSR.RC (the arms are mutually exclusive). */ \
        (  ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? (uint8_t)softfloat_round_min \
         : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? (uint8_t)softfloat_round_max \
         : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
         :                                                           (uint8_t)softfloat_round_minMag), \
        /* NOTE(review): presumably the accumulated exception flags, starting out clear - confirm against softfloat_state_t. */ \
        0, \
        /* Exception mask bits, matches X86_FSW_?E. */ \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), \
        /* Rounding precision, not relevant for SIMD. */ \
        32 \
    }
15577
15578#ifdef IEM_WITHOUT_ASSEMBLY
15579
15580/**
15581 * Helper for transfering exception to MXCSR and setting the result value
15582 * accordingly.
15583 *
15584 * @returns Updated MXCSR.
15585 * @param pSoftState The SoftFloat state following the operation.
15586 * @param r32Result The result of the SoftFloat operation.
15587 * @param pr32Result Where to store the result for IEM.
15588 * @param fMxcsr The original MXCSR value.
15589 */
15590DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
15591 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15592{
15593 iemFpSoftF32ToIprt(pr32Result, r32Result);
15594
15595 uint8_t fXcpt = pSoftState->exceptionFlags;
15596 if ( (fMxcsr & X86_MXCSR_FZ)
15597 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
15598 {
15599 /* Underflow masked and flush to zero is set. */
15600 pr32Result->s.uFraction = 0;
15601 pr32Result->s.uExponent = 0;
15602 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15603 }
15604
15605 /* If DAZ is set \#DE is never set. */
15606 if ( fMxcsr & X86_MXCSR_DAZ
15607 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15608 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15609 fXcpt &= ~X86_MXCSR_DE;
15610
15611 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15612}
15613
15614
15615/**
15616 * Helper for transfering exception to MXCSR and setting the result value
15617 * accordingly - ignores Flush-to-Zero.
15618 *
15619 * @returns Updated MXCSR.
15620 * @param pSoftState The SoftFloat state following the operation.
15621 * @param r32Result The result of the SoftFloat operation.
15622 * @param pr32Result Where to store the result for IEM.
15623 * @param fMxcsr The original MXCSR value.
15624 */
15625DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
15626 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15627{
15628 iemFpSoftF32ToIprt(pr32Result, r32Result);
15629
15630 uint8_t fXcpt = pSoftState->exceptionFlags;
15631 /* If DAZ is set \#DE is never set. */
15632 if ( fMxcsr & X86_MXCSR_DAZ
15633 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15634 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15635 fXcpt &= ~X86_MXCSR_DE;
15636
15637 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15638}
15639
15640
15641/**
15642 * Helper for transfering exception to MXCSR and setting the result value
15643 * accordingly.
15644 *
15645 * @returns Updated MXCSR.
15646 * @param pSoftState The SoftFloat state following the operation.
15647 * @param r64Result The result of the SoftFloat operation.
15648 * @param pr64Result Where to store the result for IEM.
15649 * @param fMxcsr The original MXCSR value.
15650 */
15651DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
15652 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15653{
15654 iemFpSoftF64ToIprt(pr64Result, r64Result);
15655 uint8_t fXcpt = pSoftState->exceptionFlags;
15656 if ( (fMxcsr & X86_MXCSR_FZ)
15657 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
15658 {
15659 /* Underflow masked and flush to zero is set. */
15660 iemFpSoftF64ToIprt(pr64Result, r64Result);
15661 pr64Result->s.uFractionHigh = 0;
15662 pr64Result->s.uFractionLow = 0;
15663 pr64Result->s.uExponent = 0;
15664 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15665 }
15666
15667 /* If DAZ is set \#DE is never set. */
15668 if ( fMxcsr & X86_MXCSR_DAZ
15669 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15670 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15671 fXcpt &= ~X86_MXCSR_DE;
15672
15673 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15674}
15675
15676
15677/**
15678 * Helper for transfering exception to MXCSR and setting the result value
15679 * accordingly - ignores Flush-to-Zero.
15680 *
15681 * @returns Updated MXCSR.
15682 * @param pSoftState The SoftFloat state following the operation.
15683 * @param r64Result The result of the SoftFloat operation.
15684 * @param pr64Result Where to store the result for IEM.
15685 * @param fMxcsr The original MXCSR value.
15686 */
15687DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
15688 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15689{
15690 iemFpSoftF64ToIprt(pr64Result, r64Result);
15691
15692 uint8_t fXcpt = pSoftState->exceptionFlags;
15693 /* If DAZ is set \#DE is never set. */
15694 if ( fMxcsr & X86_MXCSR_DAZ
15695 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15696 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15697 fXcpt &= ~X86_MXCSR_DE;
15698
15699 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15700}
15701
15702#endif /* IEM_WITHOUT_ASSEMBLY */
15703
15704
15705/**
15706 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
15707 * in MXCSR into account.
15708 *
15709 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15710 * @param pr32Val Where to store the result.
15711 * @param fMxcsr The input MXCSR value.
15712 * @param pr32Src The value to use.
15713 */
15714DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15715{
15716 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
15717 {
15718 if (fMxcsr & X86_MXCSR_DAZ)
15719 {
15720 /* De-normals are changed to 0. */
15721 pr32Val->s.fSign = pr32Src->s.fSign;
15722 pr32Val->s.uFraction = 0;
15723 pr32Val->s.uExponent = 0;
15724 return 0;
15725 }
15726
15727 *pr32Val = *pr32Src;
15728 return X86_MXCSR_DE;
15729 }
15730
15731 *pr32Val = *pr32Src;
15732 return 0;
15733}
15734
15735
15736/**
15737 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
15738 * in MXCSR into account.
15739 *
15740 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15741 * @param pr64Val Where to store the result.
15742 * @param fMxcsr The input MXCSR value.
15743 * @param pr64Src The value to use.
15744 */
15745DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15746{
15747 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
15748 {
15749 if (fMxcsr & X86_MXCSR_DAZ)
15750 {
15751 /* De-normals are changed to 0. */
15752 pr64Val->s64.fSign = pr64Src->s.fSign;
15753 pr64Val->s64.uFraction = 0;
15754 pr64Val->s64.uExponent = 0;
15755 return 0;
15756 }
15757
15758 *pr64Val = *pr64Src;
15759 return X86_MXCSR_DE;
15760 }
15761
15762 *pr64Val = *pr64Src;
15763 return 0;
15764}
15765
15766#ifdef IEM_WITHOUT_ASSEMBLY
15767
15768/**
15769 * Validates the given input operands returning whether the operation can continue or whether one
15770 * of the source operands contains a NaN value, setting the output accordingly.
15771 *
15772 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15773 * @param pr32Res Where to store the result in case the operation can't continue.
15774 * @param pr32Val1 The first input operand.
15775 * @param pr32Val2 The second input operand.
15776 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15777 */
15778DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
15779{
15780 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
15781 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
15782 if (cSNan + cQNan == 2)
15783 {
15784 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15785 *pr32Res = *pr32Val1;
15786 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15787 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15788 return true;
15789 }
15790 if (cSNan)
15791 {
15792 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15793 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15794 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15795 *pfMxcsr |= X86_MXCSR_IE;
15796 return true;
15797 }
15798 if (cQNan)
15799 {
15800 /* The QNan operand is placed into the result. */
15801 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15802 return true;
15803 }
15804
15805 Assert(!cQNan && !cSNan);
15806 return false;
15807}
15808
15809
15810/**
15811 * Validates the given double precision input operands returning whether the operation can continue or whether one
15812 * of the source operands contains a NaN value, setting the output accordingly.
15813 *
15814 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15815 * @param pr64Res Where to store the result in case the operation can't continue.
15816 * @param pr64Val1 The first input operand.
15817 * @param pr64Val2 The second input operand.
15818 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15819 */
15820DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
15821{
15822 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
15823 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
15824 if (cSNan + cQNan == 2)
15825 {
15826 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15827 *pr64Res = *pr64Val1;
15828 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15829 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15830 return true;
15831 }
15832 if (cSNan)
15833 {
15834 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15835 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15836 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15837 *pfMxcsr |= X86_MXCSR_IE;
15838 return true;
15839 }
15840 if (cQNan)
15841 {
15842 /* The QNan operand is placed into the result. */
15843 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15844 return true;
15845 }
15846
15847 Assert(!cQNan && !cSNan);
15848 return false;
15849}
15850
15851
15852/**
15853 * Validates the given single input operand returning whether the operation can continue or whether
15854 * contains a NaN value, setting the output accordingly.
15855 *
15856 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15857 * @param pr32Res Where to store the result in case the operation can't continue.
15858 * @param pr32Val The input operand.
15859 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15860 */
15861DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
15862{
15863 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
15864 {
15865 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15866 *pr32Res = *pr32Val;
15867 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15868 *pfMxcsr |= X86_MXCSR_IE;
15869 return true;
15870 }
15871 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
15872 {
15873 /* The QNan operand is placed into the result. */
15874 *pr32Res = *pr32Val;
15875 return true;
15876 }
15877
15878 return false;
15879}
15880
15881
15882/**
15883 * Validates the given double input operand returning whether the operation can continue or whether
15884 * contains a NaN value, setting the output accordingly.
15885 *
15886 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15887 * @param pr64Res Where to store the result in case the operation can't continue.
15888 * @param pr64Val The input operand.
15889 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15890 */
15891DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
15892{
15893 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
15894 {
15895 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15896 *pr64Res = *pr64Val;
15897 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15898 *pfMxcsr |= X86_MXCSR_IE;
15899 return true;
15900 }
15901 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
15902 {
15903 /* The QNan operand is placed into the result. */
15904 *pr64Res = *pr64Val;
15905 return true;
15906 }
15907
15908 return false;
15909}
15910
15911#endif /* IEM_WITHOUT_ASSEMBLY */
15912
15913/**
15914 * ADDPS
15915 */
15916#ifdef IEM_WITHOUT_ASSEMBLY
15917static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15918{
15919 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15920 return fMxcsr;
15921
15922 RTFLOAT32U r32Src1, r32Src2;
15923 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15924 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15925 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15926 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15927 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15928}
15929
15930
15931IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15932{
15933 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15934 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15935 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15936 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15937}
15938#endif
15939
15940
15941/**
15942 * ADDSS
15943 */
15944#ifdef IEM_WITHOUT_ASSEMBLY
15945IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15946{
15947 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15948 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15949 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15950 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15951}
15952#endif
15953
15954
15955/**
15956 * ADDPD
15957 */
15958#ifdef IEM_WITHOUT_ASSEMBLY
15959static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15960{
15961 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15962 return fMxcsr;
15963
15964 RTFLOAT64U r64Src1, r64Src2;
15965 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15966 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15967 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15968 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15969 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15970}
15971
15972
15973IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15974{
15975 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15976 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15977}
15978#endif
15979
15980
15981/**
15982 * ADDSD
15983 */
15984#ifdef IEM_WITHOUT_ASSEMBLY
15985IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15986{
15987 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15988 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15989}
15990#endif
15991
15992
15993/**
15994 * MULPS
15995 */
15996#ifdef IEM_WITHOUT_ASSEMBLY
15997static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15998{
15999 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16000 return fMxcsr;
16001
16002 RTFLOAT32U r32Src1, r32Src2;
16003 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16004 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16005 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16006 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16007 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16008}
16009
16010
16011IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16012{
16013 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16014 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16015 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16016 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16017}
16018#endif
16019
16020
16021/**
16022 * MULSS
16023 */
16024#ifdef IEM_WITHOUT_ASSEMBLY
16025IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16026{
16027 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16028 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16029 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16030 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16031}
16032#endif
16033
16034
16035/**
16036 * MULPD
16037 */
16038#ifdef IEM_WITHOUT_ASSEMBLY
16039static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16040{
16041 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16042 return fMxcsr;
16043
16044 RTFLOAT64U r64Src1, r64Src2;
16045 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16046 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16047 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16048 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16049 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16050}
16051
16052
16053IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16054{
16055 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16056 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16057}
16058#endif
16059
16060
16061/**
16062 * MULSD
16063 */
16064#ifdef IEM_WITHOUT_ASSEMBLY
16065IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16066{
16067 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16068 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16069}
16070#endif
16071
16072
16073/**
16074 * SUBPS
16075 */
16076#ifdef IEM_WITHOUT_ASSEMBLY
16077static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16078{
16079 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16080 return fMxcsr;
16081
16082 RTFLOAT32U r32Src1, r32Src2;
16083 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16084 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16085 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16086 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16087 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16088}
16089
16090
16091IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16092{
16093 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16094 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16095 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16096 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16097}
16098#endif
16099
16100
16101/**
16102 * SUBSS
16103 */
16104#ifdef IEM_WITHOUT_ASSEMBLY
16105IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16106{
16107 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16108 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16109 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16110 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16111}
16112#endif
16113
16114
16115/**
16116 * SUBPD
16117 */
16118#ifdef IEM_WITHOUT_ASSEMBLY
16119static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16120{
16121 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16122 return fMxcsr;
16123
16124 RTFLOAT64U r64Src1, r64Src2;
16125 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16126 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16127 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16128 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16129 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16130}
16131
16132
16133IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16134{
16135 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16136 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16137}
16138#endif
16139
16140
16141/**
16142 * SUBSD
16143 */
16144#ifdef IEM_WITHOUT_ASSEMBLY
16145IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16146{
16147 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16148 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16149}
16150#endif
16151
16152
16153/**
16154 * MINPS
16155 */
16156#ifdef IEM_WITHOUT_ASSEMBLY
16157static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16158{
16159 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16160 {
16161 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16162 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16163 return fMxcsr | X86_MXCSR_IE;
16164 }
16165
16166 RTFLOAT32U r32Src1, r32Src2;
16167 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16168 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16169 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16170 {
16171 *pr32Res = r32Src2;
16172 return fMxcsr;
16173 }
16174
16175 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16176 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16177 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16178 fLe
16179 ? iemFpSoftF32FromIprt(&r32Src1)
16180 : iemFpSoftF32FromIprt(&r32Src2),
16181 pr32Res, fMxcsr);
16182}
16183
16184
16185IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16186{
16187 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16188 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16189 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16190 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16191}
16192#endif
16193
16194
16195/**
16196 * MINSS
16197 */
16198#ifdef IEM_WITHOUT_ASSEMBLY
16199IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16200{
16201 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16202 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16203 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16204 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16205}
16206#endif
16207
16208
16209/**
16210 * MINPD
16211 */
16212#ifdef IEM_WITHOUT_ASSEMBLY
16213static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16214{
16215 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16216 {
16217 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16218 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16219 return fMxcsr | X86_MXCSR_IE;
16220 }
16221
16222 RTFLOAT64U r64Src1, r64Src2;
16223 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16224 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16225 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16226 {
16227 *pr64Res = r64Src2;
16228 return fMxcsr;
16229 }
16230
16231 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16232 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16233 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16234 fLe
16235 ? iemFpSoftF64FromIprt(&r64Src1)
16236 : iemFpSoftF64FromIprt(&r64Src2),
16237 pr64Res, fMxcsr);
16238}
16239
16240
16241IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16242{
16243 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16244 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16245}
16246#endif
16247
16248
16249/**
16250 * MINSD
16251 */
16252#ifdef IEM_WITHOUT_ASSEMBLY
16253IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16254{
16255 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16256 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16257}
16258#endif
16259
16260
16261/**
16262 * DIVPS
16263 */
16264#ifdef IEM_WITHOUT_ASSEMBLY
16265static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16266{
16267 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16268 return fMxcsr;
16269
16270 RTFLOAT32U r32Src1, r32Src2;
16271 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16272 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16273 if (RTFLOAT32U_IS_ZERO(&r32Src2))
16274 {
16275 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
16276 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
16277 {
16278 *pr32Res = g_ar32QNaN[1];
16279 return fMxcsr | X86_MXCSR_IE;
16280 }
16281 else if (RTFLOAT32U_IS_INF(&r32Src1))
16282 {
16283 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16284 return fMxcsr;
16285 }
16286 else
16287 {
16288 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16289 return fMxcsr | X86_MXCSR_ZE;
16290 }
16291 }
16292
16293 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16294 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16295 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16296}
16297
16298
16299IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16300{
16301 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16302 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16303 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16304 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16305}
16306#endif
16307
16308
16309/**
16310 * DIVSS
16311 */
16312#ifdef IEM_WITHOUT_ASSEMBLY
16313IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16314{
16315 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16316 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16317 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16318 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16319}
16320#endif
16321
16322
16323/**
16324 * DIVPD
16325 */
16326#ifdef IEM_WITHOUT_ASSEMBLY
16327static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16328{
16329 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16330 return fMxcsr;
16331
16332 RTFLOAT64U r64Src1, r64Src2;
16333 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16334 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16335 if (RTFLOAT64U_IS_ZERO(&r64Src2))
16336 {
16337 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
16338 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
16339 {
16340 *pr64Res = g_ar64QNaN[1];
16341 return fMxcsr | X86_MXCSR_IE;
16342 }
16343 else if (RTFLOAT64U_IS_INF(&r64Src1))
16344 {
16345 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16346 return fMxcsr;
16347 }
16348 else
16349 {
16350 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16351 return fMxcsr | X86_MXCSR_ZE;
16352 }
16353 }
16354
16355 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16356 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16357 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16358}
16359
16360
16361IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16362{
16363 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16364 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16365}
16366#endif
16367
16368
16369/**
16370 * DIVSD
16371 */
16372#ifdef IEM_WITHOUT_ASSEMBLY
16373IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16374{
16375 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16376 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16377}
16378#endif
16379
16380
16381/**
16382 * MAXPS
16383 */
16384#ifdef IEM_WITHOUT_ASSEMBLY
16385static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16386{
16387 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16388 {
16389 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16390 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16391 return fMxcsr | X86_MXCSR_IE;
16392 }
16393
16394 RTFLOAT32U r32Src1, r32Src2;
16395 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16396 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16397 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16398 {
16399 *pr32Res = r32Src2;
16400 return fMxcsr;
16401 }
16402
16403 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16404 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16405 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16406 fLe
16407 ? iemFpSoftF32FromIprt(&r32Src2)
16408 : iemFpSoftF32FromIprt(&r32Src1),
16409 pr32Res, fMxcsr);
16410}
16411
16412
16413IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16414{
16415 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16416 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16417 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16418 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16419}
16420#endif
16421
16422
16423/**
16424 * MAXSS
16425 */
16426#ifdef IEM_WITHOUT_ASSEMBLY
16427IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16428{
16429 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16430 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16431 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16432 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16433}
16434#endif
16435
16436
16437/**
16438 * MAXPD
16439 */
16440#ifdef IEM_WITHOUT_ASSEMBLY
16441static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16442{
16443 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16444 {
16445 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16446 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16447 return fMxcsr | X86_MXCSR_IE;
16448 }
16449
16450 RTFLOAT64U r64Src1, r64Src2;
16451 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16452 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16453 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16454 {
16455 *pr64Res = r64Src2;
16456 return fMxcsr;
16457 }
16458
16459 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16460 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16461 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16462 fLe
16463 ? iemFpSoftF64FromIprt(&r64Src2)
16464 : iemFpSoftF64FromIprt(&r64Src1),
16465 pr64Res, fMxcsr);
16466}
16467
16468
16469IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16470{
16471 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16472 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16473}
16474#endif
16475
16476
16477/**
16478 * MAXSD
16479 */
16480#ifdef IEM_WITHOUT_ASSEMBLY
16481IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16482{
16483 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16484 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16485}
16486#endif
16487
16488
16489/**
16490 * CVTSS2SD
16491 */
16492#ifdef IEM_WITHOUT_ASSEMBLY
16493static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16494{
16495 RTFLOAT32U r32Src1;
16496 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16497
16498 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16499 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16500 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16501}
16502
16503
16504IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16505{
16506 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
16507 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16508}
16509#endif
16510
16511
16512/**
16513 * CVTSD2SS
16514 */
16515#ifdef IEM_WITHOUT_ASSEMBLY
16516static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16517{
16518 RTFLOAT64U r64Src1;
16519 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16520
16521 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16522 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16523 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16524}
16525
16526
16527IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16528{
16529 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
16530 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16531 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16532 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16533}
16534#endif
16535
16536
16537/**
16538 * HADDPS
16539 */
16540#ifdef IEM_WITHOUT_ASSEMBLY
16541IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16542{
16543 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
16544 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
16545 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
16546 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16547}
16548#endif
16549
16550
16551/**
16552 * HADDPD
16553 */
16554#ifdef IEM_WITHOUT_ASSEMBLY
16555IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16556{
16557 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
16558 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16559}
16560#endif
16561
16562
16563/**
16564 * HSUBPS
16565 */
16566#ifdef IEM_WITHOUT_ASSEMBLY
16567IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16568{
16569 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
16570 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
16571 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
16572 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16573}
16574#endif
16575
16576
16577/**
16578 * HSUBPD
16579 */
16580#ifdef IEM_WITHOUT_ASSEMBLY
16581IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16582{
16583 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
16584 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16585}
16586#endif
16587
16588
16589/**
16590 * SQRTPS
16591 */
16592#ifdef IEM_WITHOUT_ASSEMBLY
16593static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16594{
16595 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16596 return fMxcsr;
16597
16598 RTFLOAT32U r32Src;
16599 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
16600 if (RTFLOAT32U_IS_ZERO(&r32Src))
16601 {
16602 *pr32Res = r32Src;
16603 return fMxcsr;
16604 }
16605 else if (r32Src.s.fSign)
16606 {
16607 *pr32Res = g_ar32QNaN[1];
16608 return fMxcsr | X86_MXCSR_IE;
16609 }
16610
16611 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16612 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16613 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16614}
16615
16616
16617IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16618{
16619 RT_NOREF(puSrc1);
16620
16621 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16622 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16623 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16624 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16625}
16626#endif
16627
16628
16629/**
16630 * SQRTSS
16631 */
16632#ifdef IEM_WITHOUT_ASSEMBLY
16633IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16634{
16635 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16636 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16637 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16638 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16639}
16640#endif
16641
16642
16643/**
16644 * SQRTPD
16645 */
16646#ifdef IEM_WITHOUT_ASSEMBLY
16647static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
16648{
16649 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
16650 return fMxcsr;
16651
16652 RTFLOAT64U r64Src;
16653 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
16654 if (RTFLOAT64U_IS_ZERO(&r64Src))
16655 {
16656 *pr64Res = r64Src;
16657 return fMxcsr;
16658 }
16659 else if (r64Src.s.fSign)
16660 {
16661 *pr64Res = g_ar64QNaN[1];
16662 return fMxcsr | X86_MXCSR_IE;
16663 }
16664
16665 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16666 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
16667 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16668}
16669
16670
16671IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16672{
16673 RT_NOREF(puSrc1);
16674
16675 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16676 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16677}
16678#endif
16679
16680
16681/**
16682 * SQRTSD
16683 */
16684#ifdef IEM_WITHOUT_ASSEMBLY
16685IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16686{
16687 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
16688 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16689}
16690#endif
16691
16692
16693#ifdef IEM_WITHOUT_ASSEMBLY
16694/**
16695 * RSQRTPS
16696 */
16697static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16698{
16699 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16700 return fMxcsr;
16701
16702 RTFLOAT32U r32Src;
16703 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16704 if (RTFLOAT32U_IS_ZERO(&r32Src))
16705 {
16706 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16707 return fMxcsr;
16708 }
16709 else if (r32Src.s.fSign)
16710 {
16711 *pr32Res = g_ar32QNaN[1];
16712 return fMxcsr | X86_MXCSR_IE;
16713 }
16714
16715 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16716 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16717 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16718}
16719
16720
16721IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16722{
16723 RT_NOREF(puSrc1);
16724
16725 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16726 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16727 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16728 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16729}
16730
16731
16732/**
16733 * RSQRTSS
16734 */
16735IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16736{
16737 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16738 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16739 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16740 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16741}
16742#endif
16743
16744
16745/**
16746 * RCPPS
16747 */
16748#ifdef IEM_WITHOUT_ASSEMBLY
16749static uint32_t iemAImpl_rcp_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16750{
16751 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16752 return fMxcsr;
16753
16754 RTFLOAT32U r32Src;
16755 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16756 if (RTFLOAT32U_IS_ZERO(&r32Src))
16757 {
16758 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16759 return fMxcsr;
16760 }
16761
16762 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16763 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&g_ar32One[0]), iemFpSoftF32FromIprt(&r32Src), &SoftState);
16764 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16765}
16766
16767
16768IEM_DECL_IMPL_DEF(void, iemAImpl_rcpps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16769{
16770 RT_NOREF(puSrc1);
16771
16772 pResult->MXCSR = iemAImpl_rcp_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16773 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16774 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16775 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16776}
16777
16778
16779/**
16780 * RCPSS
16781 */
16782IEM_DECL_IMPL_DEF(void, iemAImpl_rcpss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16783{
16784 pResult->MXCSR = iemAImpl_rcp_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16785 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16786 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16787 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16788}
16789#endif
16790
16791
16792/**
16793 * ADDSUBPS
16794 */
16795#ifdef IEM_WITHOUT_ASSEMBLY
16796IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16797{
16798 RT_NOREF(puSrc1);
16799
16800 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16801 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16802 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16803 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16804}
16805#endif
16806
16807
16808/**
16809 * ADDSUBPD
16810 */
16811#ifdef IEM_WITHOUT_ASSEMBLY
16812IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16813{
16814 RT_NOREF(puSrc1);
16815
16816 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16817 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16818}
16819#endif
16820
16821
16822/**
16823 * CVTPD2PS
16824 */
16825#ifdef IEM_WITHOUT_ASSEMBLY
16826static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16827{
16828 RTFLOAT64U r64Src1;
16829 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16830
16831 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16832 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16833 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16834}
16835
16836
16837IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16838{
16839 RT_NOREF(puSrc1);
16840
16841 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16842 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16843 pResult->uResult.au32[2] = 0;
16844 pResult->uResult.au32[3] = 0;
16845}
16846#endif
16847
16848
16849/**
16850 * CVTPS2PD
16851 */
16852#ifdef IEM_WITHOUT_ASSEMBLY
16853static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16854{
16855 RTFLOAT32U r32Src1;
16856 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16857
16858 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16859 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16860 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16861}
16862
16863
16864IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16865{
16866 RT_NOREF(puSrc1);
16867
16868 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16869 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16870}
16871#endif
16872
16873
16874/**
16875 * CVTDQ2PS
16876 */
16877#ifdef IEM_WITHOUT_ASSEMBLY
16878static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
16879{
16880 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16881 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
16882 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16883}
16884
16885
16886IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16887{
16888 RT_NOREF(puSrc1);
16889
16890 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16891 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16892 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
16893 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
16894}
16895#endif
16896
16897
16898/**
16899 * CVTPS2DQ
16900 */
16901#ifdef IEM_WITHOUT_ASSEMBLY
16902static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16903{
16904 RTFLOAT32U r32Src;
16905 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16906
16907 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16908 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16909 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16910}
16911
16912
16913IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16914{
16915 RT_NOREF(puSrc1);
16916
16917 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16918 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16919 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16920 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16921}
16922#endif
16923
16924
16925/**
16926 * CVTTPS2DQ
16927 */
16928#ifdef IEM_WITHOUT_ASSEMBLY
16929static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16930{
16931 RTFLOAT32U r32Src;
16932 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16933
16934 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16935 SoftState.roundingMode = softfloat_round_minMag;
16936 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16937 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16938}
16939
16940
16941IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16942{
16943 RT_NOREF(puSrc1);
16944
16945 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16946 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16947 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16948 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16949}
16950#endif
16951
16952
16953/**
16954 * CVTTPD2DQ
16955 */
16956#ifdef IEM_WITHOUT_ASSEMBLY
16957static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16958{
16959 RTFLOAT64U r64Src;
16960 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16961
16962 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16963 SoftState.roundingMode = softfloat_round_minMag;
16964 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16965 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16966}
16967
16968
16969IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16970{
16971 RT_NOREF(puSrc1);
16972
16973 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16974 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16975 pResult->uResult.au64[1] = 0;
16976}
16977#endif
16978
16979
16980/**
16981 * CVTDQ2PD
16982 */
16983#ifdef IEM_WITHOUT_ASSEMBLY
16984static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
16985{
16986 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16987 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
16988 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16989}
16990
16991
16992IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16993{
16994 RT_NOREF(puSrc1);
16995
16996 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16997 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16998}
16999#endif
17000
17001
17002/**
17003 * CVTPD2DQ
17004 */
17005#ifdef IEM_WITHOUT_ASSEMBLY
17006static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
17007{
17008 RTFLOAT64U r64Src;
17009 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
17010
17011 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17012 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17013 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17014}
17015
17016
17017IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17018{
17019 RT_NOREF(puSrc1);
17020
17021 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
17022 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
17023 pResult->uResult.au64[1] = 0;
17024}
17025#endif
17026
17027
17028/**
17029 * [V]SHUFPS
17030 */
17031#ifdef IEM_WITHOUT_ASSEMBLY
17032IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17033{
17034 RTUINT128U const uSrc1 = *puDst;
17035 RTUINT128U const uSrc2 = *puSrc;
17036 ASMCompilerBarrier();
17037 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
17038 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
17039 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
17040 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
17041}
17042#endif
17043
17044
17045IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17046{
17047 RTUINT128U const uSrc1 = *puSrc1;
17048 RTUINT128U const uSrc2 = *puSrc2;
17049 ASMCompilerBarrier();
17050 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
17051 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
17052 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
17053 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
17054}
17055
17056
17057IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17058{
17059 RTUINT256U const uSrc1 = *puSrc1;
17060 RTUINT256U const uSrc2 = *puSrc2;
17061 ASMCompilerBarrier();
17062 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
17063 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
17064 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
17065 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
17066
17067 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
17068 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
17069 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
17070 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
17071}
17072
17073
17074/**
17075 * [V]SHUFPD
17076 */
17077#ifdef IEM_WITHOUT_ASSEMBLY
17078IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17079{
17080 RTUINT128U const uSrc1 = *puDst;
17081 RTUINT128U const uSrc2 = *puSrc;
17082 ASMCompilerBarrier();
17083 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
17084 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
17085}
17086#endif
17087
17088
17089IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17090{
17091 RTUINT128U const uSrc1 = *puSrc1;
17092 RTUINT128U const uSrc2 = *puSrc2;
17093 ASMCompilerBarrier();
17094 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
17095 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
17096}
17097
17098
17099IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17100{
17101 RTUINT256U const uSrc1 = *puSrc1;
17102 RTUINT256U const uSrc2 = *puSrc2;
17103 ASMCompilerBarrier();
17104 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
17105 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
17106 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
17107 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
17108}
17109
17110
17111/*
17112 * PHMINPOSUW / VPHMINPOSUW
17113 */
17114IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17115{
17116 uint16_t u16Min = puSrc->au16[0];
17117 uint8_t idxMin = 0;
17118
17119 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
17120 if (puSrc->au16[i] < u16Min)
17121 {
17122 u16Min = puSrc->au16[i];
17123 idxMin = i;
17124 }
17125
17126 puDst->au64[0] = 0;
17127 puDst->au64[1] = 0;
17128 puDst->au16[0] = u16Min;
17129 puDst->au16[1] = idxMin;
17130}
17131
17132
/**
 * VPHMINPOSUW, C fallback.
 *
 * Simply forwards to iemAImpl_phminposuw_u128_fallback() with the same
 * arguments.
 *
 * @param   puDst   Where to store the result.
 * @param   puSrc   The source operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
{
    iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
}
17137
17138
17139/**
17140 * VPERMILPS
17141 */
17142#ifdef IEM_WITHOUT_ASSEMBLY
17143IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17144{
17145 RTUINT128U const uSrc = *puSrc;
17146 ASMCompilerBarrier();
17147
17148 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17149 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17150 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17151 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17152}
17153
17154
17155IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17156{
17157 RTUINT256U const uSrc = *puSrc;
17158 ASMCompilerBarrier();
17159
17160 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17161 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17162 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17163 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17164
17165 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17166 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17167 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17168 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17169}
17170
17171IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17172{
17173 RTUINT128U const uSrc1 = *puSrc1;
17174 RTUINT128U const uSrc2 = *puSrc2;
17175 ASMCompilerBarrier();
17176
17177 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17178 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17179 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17180 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17181}
17182
17183IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17184{
17185 RTUINT256U const uSrc1 = *puSrc1;
17186 RTUINT256U const uSrc2 = *puSrc2;
17187 ASMCompilerBarrier();
17188
17189 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17190 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17191 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17192 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17193
17194 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17195 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17196 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17197 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17198}
17199#endif
17200
17201
17202IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17203{
17204 RTUINT128U const uSrc = *puSrc;
17205 ASMCompilerBarrier();
17206
17207 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17208 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17209 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17210 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17211}
17212
17213
17214IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17215{
17216 RTUINT256U const uSrc = *puSrc;
17217 ASMCompilerBarrier();
17218
17219 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17220 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17221 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17222 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17223
17224 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17225 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17226 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17227 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17228}
17229
17230IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17231{
17232 RTUINT128U const uSrc1 = *puSrc1;
17233 RTUINT128U const uSrc2 = *puSrc2;
17234 ASMCompilerBarrier();
17235
17236 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17237 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17238 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17239 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17240}
17241
17242IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17243{
17244 RTUINT256U const uSrc1 = *puSrc1;
17245 RTUINT256U const uSrc2 = *puSrc2;
17246 ASMCompilerBarrier();
17247
17248 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17249 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17250 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17251 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17252
17253 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17254 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17255 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17256 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17257}
17258
17259
17260/**
17261 * VPERMILPD
17262 */
17263#ifdef IEM_WITHOUT_ASSEMBLY
17264IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17265{
17266 RTUINT128U const uSrc = *puSrc;
17267 ASMCompilerBarrier();
17268
17269 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17270 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17271}
17272
17273
17274IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17275{
17276 RTUINT256U const uSrc = *puSrc;
17277 ASMCompilerBarrier();
17278
17279 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17280 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17281
17282 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17283 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17284}
17285
17286IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17287{
17288 RTUINT128U const uSrc1 = *puSrc1;
17289 RTUINT128U const uSrc2 = *puSrc2;
17290 ASMCompilerBarrier();
17291
17292 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17293 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17294}
17295
17296IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17297{
17298 RTUINT256U const uSrc1 = *puSrc1;
17299 RTUINT256U const uSrc2 = *puSrc2;
17300 ASMCompilerBarrier();
17301
17302 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17303 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17304
17305 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17306 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17307}
17308#endif
17309
17310
17311IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17312{
17313 RTUINT128U const uSrc = *puSrc;
17314 ASMCompilerBarrier();
17315
17316 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17317 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17318}
17319
17320
17321IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17322{
17323 RTUINT256U const uSrc = *puSrc;
17324 ASMCompilerBarrier();
17325
17326 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17327 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17328
17329 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17330 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17331}
17332
17333IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17334{
17335 RTUINT128U const uSrc1 = *puSrc1;
17336 RTUINT128U const uSrc2 = *puSrc2;
17337 ASMCompilerBarrier();
17338
17339 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17340 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17341}
17342
17343IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17344{
17345 RTUINT256U const uSrc1 = *puSrc1;
17346 RTUINT256U const uSrc2 = *puSrc2;
17347 ASMCompilerBarrier();
17348
17349 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17350 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17351
17352 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17353 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17354}
17355
17356
17357/*
17358 * [V]PBLENDVB
17359 */
17360IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17361{
17362 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17363 if (puMask->au8[i] & RT_BIT(7))
17364 puDst->au8[i] = puSrc->au8[i];
17365}
17366
17367
17368IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17369{
17370 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17371 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17372}
17373
17374
17375IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17376{
17377 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17378 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17379}
17380
17381
17382/*
17383 * [V]BLENDVPS
17384 */
17385IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17386{
17387 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17388 if (puMask->au32[i] & RT_BIT_32(31))
17389 puDst->au32[i] = puSrc->au32[i];
17390}
17391
17392
17393IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17394{
17395 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17396 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17397}
17398
17399
17400IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17401{
17402 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17403 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17404}
17405
17406
17407/*
17408 * [V]BLENDVPD
17409 */
17410IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17411{
17412 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
17413 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
17414}
17415
17416
17417IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17418{
17419 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17420 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17421}
17422
17423
17424IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17425{
17426 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17427 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17428}
17429
17430
17431/**
17432 * [V]PALIGNR
17433 */
17434IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
17435{
17436 uint64_t const u64Src1 = *pu64Dst;
17437 ASMCompilerBarrier();
17438
17439 if (bEvil >= 16)
17440 *pu64Dst = 0;
17441 else if (bEvil >= 8)
17442 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
17443 else
17444 {
17445 uint8_t cShift = bEvil * 8;
17446 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
17447 | (u64Src2 >> cShift);
17448 }
17449}
17450
17451
17452IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17453{
17454 RTUINT128U const uSrc1 = *puDst;
17455 RTUINT128U const uSrc2 = *puSrc;
17456 ASMCompilerBarrier();
17457
17458 puDst->au64[0] = 0;
17459 puDst->au64[1] = 0;
17460 if (bEvil >= 32)
17461 { /* Everything stays 0. */ }
17462 else if (bEvil >= 16)
17463 {
17464 bEvil -= 16;
17465 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17466 puDst->au8[i - bEvil] = uSrc1.au8[i];
17467 }
17468 else
17469 {
17470 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17471 puDst->au8[i] = uSrc2.au8[i + bEvil];
17472 for (uint8_t i = 0; i < bEvil; i++)
17473 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17474 }
17475}
17476
17477
17478IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17479{
17480 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17481 RTUINT128U const uSrc2 = *puSrc2;
17482 ASMCompilerBarrier();
17483
17484 puDst->au64[0] = 0;
17485 puDst->au64[1] = 0;
17486 if (bEvil >= 32)
17487 { /* Everything stays 0. */ }
17488 else if (bEvil >= 16)
17489 {
17490 bEvil -= 16;
17491 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17492 puDst->au8[i - bEvil] = uSrc1.au8[i];
17493 }
17494 else
17495 {
17496 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17497 puDst->au8[i] = uSrc2.au8[i + bEvil];
17498 for (uint8_t i = 0; i < bEvil; i++)
17499 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17500 }
17501}
17502
17503
17504IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17505{
17506 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17507 RTUINT256U const uSrc2 = *puSrc2;
17508 ASMCompilerBarrier();
17509
17510 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
17511 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
17512}
17513
17514
17515/**
17516 * [V]PBLENDW
17517 */
17518IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17519{
17520 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17521 if (bEvil & RT_BIT(i))
17522 puDst->au16[i] = puSrc->au16[i];
17523}
17524
17525
17526IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17527{
17528 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17529 if (bEvil & RT_BIT(i))
17530 puDst->au16[i] = puSrc2->au16[i];
17531 else
17532 puDst->au16[i] = puSrc1->au16[i];
17533}
17534
17535
17536IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17537{
17538 for (uint8_t i = 0; i < 8; i++)
17539 if (bEvil & RT_BIT(i))
17540 {
17541 puDst->au16[ i] = puSrc2->au16[ i];
17542 puDst->au16[8 + i] = puSrc2->au16[8 + i];
17543 }
17544 else
17545 {
17546 puDst->au16[ i] = puSrc1->au16[ i];
17547 puDst->au16[8 + i] = puSrc1->au16[8 + i];
17548 }
17549}
17550
17551
17552/**
17553 * [V]PBLENDD
17554 */
17555IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17556{
17557 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17558 if (bEvil & RT_BIT(i))
17559 puDst->au32[i] = puSrc2->au32[i];
17560 else
17561 puDst->au32[i] = puSrc1->au32[i];
17562}
17563
17564
17565IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17566{
17567 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17568 if (bEvil & RT_BIT(i))
17569 puDst->au32[i] = puSrc2->au32[i];
17570 else
17571 puDst->au32[i] = puSrc1->au32[i];
17572}
17573
17574
17575/**
17576 * [V]BLENDPS
17577 */
17578IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17579{
17580 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17581 if (bEvil & RT_BIT(i))
17582 puDst->au32[i] = puSrc->au32[i];
17583}
17584
17585
17586IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17587{
17588 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17589 if (bEvil & RT_BIT(i))
17590 puDst->au32[i] = puSrc2->au32[i];
17591 else
17592 puDst->au32[i] = puSrc1->au32[i];
17593}
17594
17595
17596IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17597{
17598 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17599 if (bEvil & RT_BIT(i))
17600 puDst->au32[i] = puSrc2->au32[i];
17601 else
17602 puDst->au32[i] = puSrc1->au32[i];
17603}
17604
17605
17606/**
17607 * [V]BLENDPD
17608 */
17609IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17610{
17611 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17612 if (bEvil & RT_BIT(i))
17613 puDst->au64[i] = puSrc->au64[i];
17614}
17615
17616
17617IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17618{
17619 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17620 if (bEvil & RT_BIT(i))
17621 puDst->au64[i] = puSrc2->au64[i];
17622 else
17623 puDst->au64[i] = puSrc1->au64[i];
17624}
17625
17626
17627IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17628{
17629 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17630 if (bEvil & RT_BIT(i))
17631 puDst->au64[i] = puSrc2->au64[i];
17632 else
17633 puDst->au64[i] = puSrc1->au64[i];
17634}
17635
17636
17637/**
17638 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
17639 */
17640
/** The AES S-Box lookup table, indexed by the byte to substitute (one row per high nibble). */
static uint8_t iemAImpl_aes_sbox[256] =
{
    /* 0x00 */ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    /* 0x10 */ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    /* 0x20 */ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    /* 0x30 */ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    /* 0x40 */ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    /* 0x50 */ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    /* 0x60 */ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    /* 0x70 */ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    /* 0x80 */ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    /* 0x90 */ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    /* 0xa0 */ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    /* 0xb0 */ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    /* 0xc0 */ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    /* 0xd0 */ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    /* 0xe0 */ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    /* 0xf0 */ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
17659
17660/* The InvS-Box lookup table. */
/** The AES inverse S-Box lookup table, indexed by the byte to substitute (one row per high nibble). */
static uint8_t iemAImpl_aes_inv_sbox[256] =
{
    /* 0x00 */ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    /* 0x10 */ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    /* 0x20 */ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    /* 0x30 */ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    /* 0x40 */ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    /* 0x50 */ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    /* 0x60 */ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    /* 0x70 */ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    /* 0x80 */ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    /* 0x90 */ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    /* 0xa0 */ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    /* 0xb0 */ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    /* 0xc0 */ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    /* 0xd0 */ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    /* 0xe0 */ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    /* 0xf0 */ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
17679
17680/* The ShiftRows lookup table. */
/** The ShiftRows lookup table: entry i is the index of the state byte that ends up at position i. */
static uint8_t iemAImpl_aes_shift_rows_tbl[16] =
{
     0,  5, 10, 15,
     4,  9, 14,  3,
     8, 13,  2,  7,
    12,  1,  6, 11
};
17684
17685/* The InvShiftRows lookup table. */
/** The InvShiftRows lookup table: entry i is the index of the state byte that ends up at position i. */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[16] =
{
     0, 13, 10,  7,
     4,  1, 14, 11,
     8,  5,  2, 15,
    12,  9,  6,  3
};
17689
17690static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
17691{
17692 RTUINT128U uVal;
17693 int i;
17694
17695 for (i = 0; i < 16; ++i)
17696 uVal.au8[i] = abSubst[puSrc->au8[i]];
17697
17698 return uVal;
17699}
17700
/**
 * Doubles a byte the AES way: shifts it left by one bit and XORs in 0x1b (27)
 * when the top bit was set before the shift.
 *
 * @returns The result, truncated to 8 bits.
 * @param   u       The byte to double.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    uint8_t const bReduce = (u & 0x80) ? 0x1b : 0x00;
    return (uint8_t)((u << 1) ^ bReduce);
}
17705
17706static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
17707{
17708 RTUINT128U uVal;
17709 int i;
17710 uint8_t tmp;
17711
17712 for (i = 0; i < 16; i += 4) {
17713 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
17714 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
17715 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
17716 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
17717 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
17718 }
17719
17720 return uVal;
17721}
17722
17723static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
17724{
17725 RTUINT128U uVal;
17726 int i;
17727
17728 for (i = 0; i < 16; ++i)
17729 uVal.au8[i] = puSrc->au8[abShift[i]];
17730
17731 return uVal;
17732}
17733
17734static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
17735{
17736 uint8_t val;
17737
17738 val = ((b >> 0) & 1) * a;
17739 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
17740 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
17741 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
17742 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
17743
17744 return val;
17745}
17746
17747static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
17748{
17749 RTUINT128U uVal;
17750 int i;
17751
17752 for (i = 0; i < 16; i += 4) {
17753 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
17754 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
17755 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
17756 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
17757 }
17758
17759 return uVal;
17760}
17761
17762static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
17763{
17764 RTUINT32U uTmp;
17765
17766 uTmp.au32[0] = w;
17767 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
17768 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
17769 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
17770 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
17771
17772 return uTmp.au32[0];
17773}
17774
/**
 * Rotates a 32-bit word right by 8 bits, i.e. the lowest byte moves to the top.
 *
 * @returns The rotated word.
 * @param   w       The word to rotate.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uLowByte = w & 0xff;
    return (w >> 8) | (uLowByte << 24);
}
17779
17780/**
17781 * [V]AESKEYGENASSIST
17782 */
17783IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
17784{
17785 RTUINT128U uTmp;
17786 uint32_t uRCon = bImm; /* Round constant. */
17787
17788 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
17789 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
17790 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
17791 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
17792
17793 *puDst = uTmp;
17794}
17795
17796
17797/**
17798 * [V]AESIMC
17799 */
17800IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17801{
17802 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
17803}
17804
17805
17806/**
17807 * [V]AESENC
17808 */
17809IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17810{
17811 RTUINT128U uTmp;
17812
17813 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17814 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17815 uTmp = iemAImpl_aes_mix_col(&uTmp);
17816 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17817 uTmp.au64[1] ^= puSrc->au64[1];
17818
17819 *puDst = uTmp;
17820}
17821
17822
17823/**
17824 * [V]AESENCLAST
17825 */
17826IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17827{
17828 RTUINT128U uTmp;
17829
17830 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17831 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17832 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17833 uTmp.au64[1] ^= puSrc->au64[1];
17834
17835 *puDst = uTmp;
17836}
17837
17838
17839/**
17840 * [V]AESDEC
17841 */
17842IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17843{
17844 RTUINT128U uTmp;
17845
17846 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17847 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17848 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
17849 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17850 uTmp.au64[1] ^= puSrc->au64[1];
17851
17852 *puDst = uTmp;
17853}
17854
17855
17856/**
17857 * [V]AESDECLAST
17858 */
17859IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17860{
17861 RTUINT128U uTmp;
17862
17863 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17864 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17865 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17866 uTmp.au64[1] ^= puSrc->au64[1];
17867
17868 *puDst = uTmp;
17869}
17870
17871
17872/**
17873 * [V]PCMPISTRI
17874 */
17875
17876/**
17877 * Does the comparisons based on the mode and source input format.
17878 */
17879static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
17880{
17881#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
17882 do \
17883 { \
17884 for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
17885 for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
17886 { \
17887 switch (a_bAggOp) \
17888 { \
17889 case 0: \
17890 case 2: \
17891 case 3: \
17892 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
17893 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
17894 break; \
17895 case 1: \
17896 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
17897 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
17898 break; \
17899 default: \
17900 AssertReleaseFailed(); \
17901 } \
17902 } \
17903 } while(0)
17904
17905 uint8_t bAggOp = (bImm >> 2) & 0x3;
17906 switch (bImm & 0x3)
17907 {
17908 case 0:
17909 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
17910 break;
17911 case 1:
17912 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
17913 break;
17914 case 2:
17915 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
17916 break;
17917 case 3:
17918 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
17919 break;
17920 default:
17921 AssertReleaseFailed();
17922 }
17923#undef PCMPXSTRX_CMP_CASE
17924}
17925
17926static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
17927{
17928 if (bImm & 0x1)
17929 {
17930 /* Words -> 8 elements. */
17931 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
17932 if (puSrc->au16[i] == 0)
17933 return i;
17934
17935 return 8;
17936 }
17937 else
17938 {
17939 /* Bytes -> 16 elements. */
17940 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
17941 if (puSrc->au8[i] == 0)
17942 return i;
17943
17944 return 16;
17945 }
17946}
17947
/**
 * Converts an explicit (register supplied) string length into an element
 * count.
 *
 * @returns The absolute value of the length, saturated to the element count
 *          (8 for words, 16 for bytes).
 * @param   i64Len  The signed length.
 * @param   bImm    The instruction immediate; bit 0 set selects word
 *                  elements, clear selects byte elements.
 */
static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
{
    int64_t const cMaxElems = (bImm & 0x1) ? 8 : 16;

    /* Saturate anything whose magnitude reaches the element count. */
    if (i64Len <= -cMaxElems || i64Len >= cMaxElems)
        return (uint8_t)cMaxElems;

    return RT_ABS(i64Len);
}
17965
/**
 * Valid/Invalid override of comparisons (Table 4-7 from 4.1.6 of SDM).
 *
 * Rows are indexed by Imm8[3:2] (the aggregation operation).  Columns are:
 *      - 0: xmm1 AND xmm2/m128 invalid,
 *      - 1: xmm1 invalid BUT xmm2/m128 valid,
 *      - 2: xmm1 valid BUT xmm2/m128 invalid,
 *      - 3: unused dummy/padding for parfait.
 */
static const bool g_afCmpOverride[4][4] =
{
    /* Imm8[3:2] = 00b (equal any) */
    { false, false, false, false },
    /* Imm8[3:2] = 01b (ranges) */
    { false, false, false, false },
    /* Imm8[3:2] = 10b (equal each) */
    { true,  false, false, false },
    /* Imm8[3:2] = 11b (equal ordered) */
    { true,  true,  false, false },
};
17977
17978DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
17979{
17980 if (fSrc1Valid && fSrc2Valid)
17981 return fCmpRes;
17982
17983 uint8_t const bSrc1Valid = fSrc1Valid ? 2 : 0;
17984 uint8_t const bSrc2Valid = fSrc2Valid ? 1 : 0;
17985 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
17986}
17987
/**
 * Aggregates the comparison matrix into the per-element result bitmap and
 * applies the polarity selected by the immediate.
 *
 * @returns The result bitmap, one bit per element.
 * @param   afCmpRes    The comparison matrix, indexed [source 2 element]
 *                      [source 1 element].
 * @param   idxLen1     Number of valid elements in source 1.
 * @param   idxLen2     Number of valid elements in source 2.
 * @param   cElems      Number of elements per operand (8 or 16).
 * @param   bImm        The instruction immediate: bits 3:2 select the
 *                      aggregation operation, bits 5:4 the polarity.
 */
static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
{
    uint8_t const bAggOp    = (bImm >> 2) & 0x3;
    uint16_t      u16Result = 0;

    if (bAggOp == 0)
    {
        /* Equal any: bit j is set if source 2 element j matches any source 1 element. */
        for (uint8_t j = 0; j < cElems; j++)
        {
            bool fHit = false;
            for (uint8_t i = 0; i < cElems && !fHit; i++)
                fHit = iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[j][i], i < idxLen1, j < idxLen2, bAggOp);
            if (fHit)
                u16Result |= RT_BIT(j);
        }
    }
    else if (bAggOp == 1)
    {
        /* Ranges: bit j is set if source 2 element j satisfies both bounds of any source 1 pair. */
        for (uint8_t j = 0; j < cElems; j++)
        {
            bool fHit = false;
            for (uint8_t i = 0; i < cElems && !fHit; i += 2)
                fHit =    iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[j][i], i < idxLen1, j < idxLen2, bAggOp)
                       && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[j][i + 1], (i + 1) < idxLen1, j < idxLen2,
                                                                     bAggOp);
            if (fHit)
                u16Result |= RT_BIT(j);
        }
    }
    else if (bAggOp == 2)
    {
        /* Equal each: bit i reflects the comparison of element i of both sources. */
        for (uint8_t i = 0; i < cElems; i++)
            if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[i][i], i < idxLen1, i < idxLen2, bAggOp))
                u16Result |= RT_BIT(i);
    }
    else
    {
        /* Equal ordered: bit j is set if source 1 matches source 2 starting at element j. */
        for (uint8_t j = 0; j < cElems; j++)
        {
            bool fAllMatch = true;
            for (uint8_t i = 0; j + i < cElems && fAllMatch; i++)
            {
                uint8_t const k = j + i;
                fAllMatch = iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[k][i], i < idxLen1, k < idxLen2, bAggOp);
            }
            if (fAllMatch)
                u16Result |= RT_BIT(j);
        }
    }

    /* Polarity selection; values 0 and 2 leave the result untouched. */
    uint8_t const bPolarity = (bImm >> 4) & 0x3;
    if (bPolarity == 1)
        u16Result ^= cElems == 8 ? 0xff : 0xffff;   /* Invert all element bits. */
    else if (bPolarity == 3)
        u16Result ^= RT_BIT(idxLen2) - 1;           /* Invert only bits of valid source 2 elements. */

    return u16Result;
}
18091
18092DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
18093{
18094 uint32_t fEFlags = 0;
18095
18096 if (u16Result)
18097 fEFlags |= X86_EFL_CF;
18098 if (cLen2 < cElems)
18099 fEFlags |= X86_EFL_ZF;
18100 if (cLen1 < cElems)
18101 fEFlags |= X86_EFL_SF;
18102 if (u16Result & 0x1)
18103 fEFlags |= X86_EFL_OF;
18104 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
18105}
18106
18107DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
18108 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
18109{
18110 bool afCmpRes[16][16];
18111 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18112
18113 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
18114 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
18115 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
18116
18117 return u16Result;
18118}
18119
18120DECL_FORCE_INLINE(void) iemAImpl_pcmpxstri_set_result_index(uint32_t *pu32Ecx, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
18121{
18122 if (bImm & RT_BIT(6))
18123 {
18124 /* Index for MSB set. */
18125 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
18126 if (idxMsb)
18127 *pu32Ecx = idxMsb - 1;
18128 else
18129 *pu32Ecx = cElems;
18130 }
18131 else
18132 {
18133 /* Index for LSB set. */
18134 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
18135 if (idxLsb)
18136 *pu32Ecx = idxLsb - 1;
18137 else
18138 *pu32Ecx = cElems;
18139 }
18140}
18141
18142IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
18143{
18144 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18145 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
18146 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
18147
18148 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18149 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
18150}
18151
18152
18153/**
18154 * [V]PCMPESTRI
18155 */
18156IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
18157{
18158 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18159 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
18160 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
18161
18162 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18163 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
18164}
18165
18166
18167/**
18168 * [V]PCMPISTRM
18169 */
18170DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
18171{
18172 if (bImm & RT_BIT(6))
18173 {
18174 /* Generate a mask. */
18175 if (cElems == 8)
18176 {
18177 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18178 if (u16Result & RT_BIT(i))
18179 puDst->au16[i] = 0xffff;
18180 else
18181 puDst->au16[i] = 0;
18182 }
18183 else
18184 {
18185 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
18186 if (u16Result & RT_BIT(i))
18187 puDst->au8[i] = 0xff;
18188 else
18189 puDst->au8[i] = 0;
18190 }
18191 }
18192 else
18193 {
18194 /* Store the result. */
18195 puDst->au64[0] = u16Result;
18196 puDst->au64[1] = 0;
18197 }
18198}
18199
18200IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
18201{
18202 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18203 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
18204 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
18205
18206 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18207 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18208}
18209
18210
18211/**
18212 * [V]PCMPESTRM
18213 */
18214IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
18215{
18216 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18217 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
18218 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
18219
18220 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18221 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18222}
18223
18224
18225/*
18226 * [V]PCLMULQDQ
18227 */
18228IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18229{
18230 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
18231}
18232
18233
18234IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
18235{
18236 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
18237 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
18238
18239 puDst->au64[0] = 0;
18240 puDst->au64[1] = 0;
18241
18242 /*
18243 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
18244 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
18245 * and squeeze out some optimizations.
18246 */
18247 if (uSrc1 & 0x1)
18248 puDst->au64[0] = uSrc2;
18249
18250 uSrc1 >>= 1;
18251
18252 uint8_t iDigit = 1;
18253 while (uSrc1)
18254 {
18255 if (uSrc1 & 0x1)
18256 {
18257 puDst->au64[0] ^= (uSrc2 << iDigit);
18258 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
18259 }
18260
18261 uSrc1 >>= 1;
18262 iDigit++;
18263 }
18264}
18265
18266
18267/**
18268 * [V]PINSRW
18269 */
18270#ifdef IEM_WITHOUT_ASSEMBLY
18271IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u64,(uint64_t *pu64Dst, uint16_t u16Src, uint8_t bEvil))
18272{
18273 uint8_t cShift = (bEvil & 0x3) * 16;
18274 *pu64Dst = (*pu64Dst & ~(UINT64_C(0xffff) << cShift)) | ((uint64_t)u16Src << cShift);
18275}
18276
18277
18278IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u128,(PRTUINT128U puDst, uint16_t u16Src, uint8_t bEvil))
18279{
18280 puDst->au16[bEvil & 0x7] = u16Src;
18281}
18282#endif
18283
18284
18285IEM_DECL_IMPL_DEF(void, iemAImpl_vpinsrw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint16_t u16Src, uint8_t bEvil))
18286{
18287 *puDst = *puSrc;
18288 puDst->au16[bEvil & 0x7] = u16Src;
18289}
18290
18291
18292/**
18293 * [V]PEXTRW
18294 */
18295#ifdef IEM_WITHOUT_ASSEMBLY
18296IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u64,(uint16_t *pu16Dst, uint64_t u64Src, uint8_t bEvil))
18297{
18298 *pu16Dst = (uint16_t)(u64Src >> ((bEvil & 0x3) * 16));
18299}
18300
18301
18302IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u128,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
18303{
18304 *pu16Dst = puSrc->au16[bEvil & 0x7];
18305}
18306
18307#endif
18308
18309IEM_DECL_IMPL_DEF(void, iemAImpl_vpextrw_u128_fallback,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
18310{
18311 *pu16Dst = puSrc->au16[bEvil & 0x7];
18312}
18313
18314
18315/**
18316 * [V]MOVMSKPS
18317 */
18318#ifdef IEM_WITHOUT_ASSEMBLY
18319IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18320{
18321 *pu8Dst = puSrc->au32[0] >> 31;
18322 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18323 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18324 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18325}
18326
18327#endif
18328
18329IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18330{
18331 *pu8Dst = puSrc->au32[0] >> 31;
18332 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18333 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18334 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18335}
18336
18337
18338IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18339{
18340 *pu8Dst = puSrc->au32[0] >> 31;
18341 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18342 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18343 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18344 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
18345 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
18346 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
18347 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
18348}
18349
18350
18351/**
18352 * [V]MOVMSKPD
18353 */
18354#ifdef IEM_WITHOUT_ASSEMBLY
18355IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18356{
18357 *pu8Dst = puSrc->au64[0] >> 63;
18358 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18359}
18360
18361#endif
18362
18363IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18364{
18365 *pu8Dst = puSrc->au64[0] >> 63;
18366 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18367}
18368
18369
18370IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18371{
18372 *pu8Dst = puSrc->au64[0] >> 63;
18373 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18374 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
18375 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
18376}
18377
18378
18379/**
18380 * CVTTSD2SI
18381 */
18382#ifdef IEM_WITHOUT_ASSEMBLY
18383IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
18384{
18385 RTFLOAT64U r64Src;
18386
18387 r64Src.u = *pu64Src;
18388 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
18389
18390 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18391 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18392 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18393}
18394
18395
18396IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
18397{
18398 RTFLOAT64U r64Src;
18399
18400 r64Src.u = *pu64Src;
18401 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
18402
18403 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18404 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18405 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18406}
18407#endif
18408
18409
18410/**
18411 * CVTSD2SI
18412 */
18413#ifdef IEM_WITHOUT_ASSEMBLY
18414IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
18415{
18416 RTFLOAT64U r64Src;
18417
18418 r64Src.u = *pu64Src;
18419 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
18420
18421 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18422 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18423 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18424}
18425
18426
18427IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
18428{
18429 RTFLOAT64U r64Src;
18430
18431 r64Src.u = *pu64Src;
18432 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
18433
18434 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18435 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18436 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18437}
18438#endif
18439
18440
18441/**
18442 * CVTTSS2SI
18443 */
18444#ifdef IEM_WITHOUT_ASSEMBLY
18445IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
18446{
18447 RTFLOAT32U r32Src;
18448
18449 r32Src.u = *pu32Src;
18450 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
18451
18452 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18453 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18454 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18455}
18456
18457
18458IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
18459{
18460 RTFLOAT32U r32Src;
18461
18462 r32Src.u = *pu32Src;
18463 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
18464
18465 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18466 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18467 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18468}
18469#endif
18470
18471
18472/**
18473 * CVTSS2SI
18474 */
18475#ifdef IEM_WITHOUT_ASSEMBLY
18476IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
18477{
18478 RTFLOAT32U r32Src;
18479
18480 r32Src.u = *pu32Src;
18481 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
18482
18483 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18484 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18485 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18486}
18487
18488
18489IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
18490{
18491 RTFLOAT32U r32Src;
18492
18493 r32Src.u = *pu32Src;
18494 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
18495
18496 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18497 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18498 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18499}
18500#endif
18501
18502
18503/**
18504 * CVTSI2SD
18505 */
18506#ifdef IEM_WITHOUT_ASSEMBLY
18507IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
18508{
18509 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18510 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
18511 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
18512}
18513
18514
18515IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
18516{
18517 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18518 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
18519 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
18520}
18521#endif
18522
18523
18524/**
18525 * CVTSI2SS
18526 */
18527#ifdef IEM_WITHOUT_ASSEMBLY
18528IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
18529{
18530 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18531 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
18532 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
18533}
18534
18535
18536IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
18537{
18538 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18539 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
18540 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
18541}
18542#endif
18543
18544
18545/**
18546 * [V]UCOMISS
18547 */
18548#ifdef IEM_WITHOUT_ASSEMBLY
18549IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18550{
18551 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18552
18553 if (RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]))
18554 {
18555 *pfMxcsr |= X86_MXCSR_IE;
18556 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18557 }
18558 else if (RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
18559 {
18560 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18561 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18562 }
18563 else
18564 {
18565 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18566
18567 RTFLOAT32U r32Src1, r32Src2;
18568 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
18569 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
18570
18571 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18572 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18573 if (f32_eq(f32Src1, f32Src2, &SoftState))
18574 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18575 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18576 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18577 /* else: GREATER_THAN 000 */
18578
18579 *pfMxcsr |= fDe;
18580 }
18581
18582 *pfEFlags = fEFlagsNew;
18583}
18584#endif
18585
18586IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18587{
18588 iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
18589}
18590
18591
18592/**
18593 * [V]UCOMISD
18594 */
18595#ifdef IEM_WITHOUT_ASSEMBLY
18596IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18597{
18598 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18599
18600 if (RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]))
18601 {
18602 *pfMxcsr |= X86_MXCSR_IE;
18603 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18604 }
18605 else if (RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
18606 {
18607 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18608 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18609 }
18610 else
18611 {
18612 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18613
18614 RTFLOAT64U r64Src1, r64Src2;
18615 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0])
18616 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
18617
18618 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18619 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18620 if (f64_eq(f64Src1, f64Src2, &SoftState))
18621 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18622 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18623 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18624 /* else: GREATER_THAN 000 */
18625
18626 *pfMxcsr |= fDe;
18627 }
18628
18629 *pfEFlags = fEFlagsNew;
18630}
18631#endif
18632
18633IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18634{
18635 iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
18636}
18637
18638
18639/**
18640 * [V]COMISS
18641 */
18642#ifdef IEM_WITHOUT_ASSEMBLY
18643IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18644{
18645 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18646
18647 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
18648 || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
18649 {
18650 *pfMxcsr |= X86_MXCSR_IE;
18651 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18652 }
18653 else
18654 {
18655 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18656
18657 RTFLOAT32U r32Src1, r32Src2;
18658 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0])
18659 | iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
18660
18661 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18662 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18663 if (f32_eq(f32Src1, f32Src2, &SoftState))
18664 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18665 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18666 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18667 /* else: GREATER_THAN 000 */
18668
18669 *pfMxcsr |= fDe;
18670 }
18671
18672 *pfEFlags = fEFlagsNew;
18673}
18674#endif
18675
18676
18677IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18678{
18679 iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
18680}
18681
18682
18683/**
18684 * [V]COMISD
18685 */
18686#ifdef IEM_WITHOUT_ASSEMBLY
18687IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18688{
18689 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18690
18691 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
18692 || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
18693 {
18694 *pfMxcsr |= X86_MXCSR_IE;
18695 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18696 }
18697 else
18698 {
18699 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18700
18701 RTFLOAT64U r64Src1, r64Src2;
18702 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
18703 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
18704
18705 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18706 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18707 if (f64_eq(f64Src1, f64Src2, &SoftState))
18708 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18709 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18710 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18711 /* else: GREATER_THAN 000 */
18712
18713 *pfMxcsr |= fDe;
18714 }
18715
18716 *pfEFlags = fEFlagsNew;
18717}
18718#endif
18719
18720IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18721{
18722 iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
18723}
18724
18725
18726/**
18727 * CMPPS / CMPPD / CMPSS / CMPSD
18728 */
18729#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * One row of the compare truth table: the outcome of a compare predicate for
 * each possible relation between the operands A and B.
 */
typedef struct CMPTRUTHTBLENTRY
{
    /** Whether the invalid-operation flag (X86_MXCSR_IE) is raised when one
     *  of the source operands is a QNaN. */
    bool        fSignalsOnQNan;
    /** The result when the operands are unordered. */
    bool        fUnordered;
    /** The result when A = B. */
    bool        fEqual;
    /** The result when A < B. */
    bool        fLowerThan;
    /** The result when A > B. */
    bool        fGreaterThan;
} CMPTRUTHTBLENTRY;
/** Pointer to a const compare truth table entry. */
typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
18748
18749
18750/** The compare truth table (indexed by immediate). */
18751static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
18752{
18753 /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
18754 /* 00H (EQ_OQ) */ { false, false, true, false, false },
18755 /* 01H (LT_OS) */ { true, false, false, true, false },
18756 /* 02H (LE_OS) */ { true, false, true, true, false },
18757 /* 03H (UNORD_Q) */ { false, true, false, false, false },
18758 /* 04H (NEQ_UQ) */ { false, true, false, true, true },
18759 /* 05H (NLT_US) */ { true, true, true, false, true },
18760 /* 06H (NLE_US) */ { true, true, false, false, true },
18761 /* 07H (ORQ_Q) */ { false, false, true, true, true },
18762 /** @todo AVX variants. */
18763};
18764
18765
18766static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
18767{
18768 bool fRes;
18769 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18770
18771 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
18772 {
18773 *pfMxcsr |= X86_MXCSR_IE;
18774 fRes = g_aCmpTbl[bEvil].fUnordered;
18775 }
18776 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
18777 {
18778 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18779 *pfMxcsr |= X86_MXCSR_IE;
18780 fRes = g_aCmpTbl[bEvil].fUnordered;
18781 }
18782 else
18783 {
18784 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18785
18786 RTFLOAT32U r32Src1, r32Src2;
18787 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
18788 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
18789
18790 *pfMxcsr |= fDe;
18791 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18792 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18793 if (f32_eq(f32Src1, f32Src2, &SoftState))
18794 fRes = g_aCmpTbl[bEvil].fEqual;
18795 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18796 fRes = g_aCmpTbl[bEvil].fLowerThan;
18797 else
18798 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18799 }
18800
18801 return fRes;
18802}
18803
18804
18805static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
18806{
18807 bool fRes;
18808 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18809
18810 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
18811 {
18812 *pfMxcsr |= X86_MXCSR_IE;
18813 fRes = g_aCmpTbl[bEvil].fUnordered;
18814 }
18815 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
18816 {
18817 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18818 *pfMxcsr |= X86_MXCSR_IE;
18819 fRes = g_aCmpTbl[bEvil].fUnordered;
18820 }
18821 else
18822 {
18823 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18824
18825 RTFLOAT64U r64Src1, r64Src2;
18826 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
18827 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
18828
18829 *pfMxcsr |= fDe;
18830 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18831 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18832 if (f64_eq(f64Src1, f64Src2, &SoftState))
18833 fRes = g_aCmpTbl[bEvil].fEqual;
18834 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18835 fRes = g_aCmpTbl[bEvil].fLowerThan;
18836 else
18837 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18838 }
18839
18840 return fRes;
18841}
18842
18843
18844IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18845{
18846 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18847 {
18848 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
18849 puDst->au32[i] = UINT32_MAX;
18850 else
18851 puDst->au32[i] = 0;
18852 }
18853}
18854
18855
18856IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18857{
18858 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18859 {
18860 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
18861 puDst->au64[i] = UINT64_MAX;
18862 else
18863 puDst->au64[i] = 0;
18864 }
18865}
18866
18867
18868IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18869{
18870 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
18871 puDst->au32[0] = UINT32_MAX;
18872 else
18873 puDst->au32[0] = 0;
18874
18875 puDst->au32[1] = pSrc->uSrc1.au32[1];
18876 puDst->au64[1] = pSrc->uSrc1.au64[1];
18877}
18878
18879
18880IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18881{
18882 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
18883 puDst->au64[0] = UINT64_MAX;
18884 else
18885 puDst->au64[0] = 0;
18886
18887 puDst->au64[1] = pSrc->uSrc1.au64[1];
18888}
18889#endif
18890
18891
18892/**
18893 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
18894 */
18895
/** ROUNDxx immediate: rounding control field, shifted into the MXCSR RC position when used. */
#define X86_SSE_ROUNDXX_IMM_RC_MASK         UINT8_C(0x03)
/** ROUNDxx immediate: when set the MXCSR rounding control is used instead of the immediate field. */
#define X86_SSE_ROUNDXX_IMM_ROUND_SEL       UINT8_C(0x04)
/** ROUNDxx immediate: when set the rounding is done without requesting the exact (inexact reporting) mode. */
#define X86_SSE_ROUNDXX_IMM_PRECISION       UINT8_C(0x08)

/** ROUNDxx immediate: all bits consumed by the round workers. */
#define X86_SSE_ROUNDXX_IMM_MASK            UINT8_C(0x0F)
18901
18902DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
18903{
18904 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
18905 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18906
18907 fMxcsr &= ~X86_MXCSR_RC_MASK;
18908 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
18909 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18910}
18911
18912static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
18913{
18914 RTFLOAT32U r32Src, r32Dst;
18915 float32_t f32Src;
18916 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18917 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18918
18919 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
18920 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
18921
18922 iemFpSoftF32ToIprt(&r32Dst, f32Src);
18923 return r32Dst;
18924}
18925
18926static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
18927{
18928 RTFLOAT64U r64Src, r64Dst;
18929 float64_t f64Src;
18930 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18931 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18932
18933 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
18934 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
18935
18936 iemFpSoftF64ToIprt(&r64Dst, f64Src);
18937 return r64Dst;
18938}
18939
18940#ifdef IEM_WITHOUT_ASSEMBLY
18941IEM_DECL_IMPL_DEF(void, iemAImpl_roundss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18942{
18943 puDst->ar32[0] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18944 puDst->au32[1] = pSrc->uSrc1.au32[1];
18945 puDst->au64[1] = pSrc->uSrc1.au64[1];
18946}
18947
18948
18949IEM_DECL_IMPL_DEF(void, iemAImpl_roundsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18950{
18951 puDst->ar64[0] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18952 puDst->au64[1] = pSrc->uSrc1.au64[1];
18953}
18954#endif
18955
18956IEM_DECL_IMPL_DEF(void, iemAImpl_roundps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18957{
18958 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18959 {
18960 puDst->ar32[i] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18961 }
18962}
18963
18964
18965IEM_DECL_IMPL_DEF(void, iemAImpl_roundpd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18966{
18967 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18968 {
18969 puDst->ar64[i] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18970 }
18971}
18972
18973/**
18974 * CVTPD2PI
18975 */
18976#ifdef IEM_WITHOUT_ASSEMBLY
18977static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18978{
18979 RTFLOAT64U r64Src;
18980 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18981
18982 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18983 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18984 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18985}
18986
18987
18988IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18989{
18990 RTUINT64U u64Res;
18991 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
18992 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
18993
18994 *pu64Dst = u64Res.u;
18995 *pfMxcsr = fMxcsrOut;
18996}
18997#endif
18998
18999
19000/**
19001 * CVTTPD2PI
19002 */
19003#ifdef IEM_WITHOUT_ASSEMBLY
19004static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
19005{
19006 RTFLOAT64U r64Src;
19007 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
19008
19009 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19010 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
19011 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
19012}
19013
19014
19015IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
19016{
19017 RTUINT64U u64Res;
19018 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
19019 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
19020
19021 *pu64Dst = u64Res.u;
19022 *pfMxcsr = fMxcsrOut;
19023}
19024#endif
19025
19026
19027/**
19028 * CVTPI2PS
19029 */
19030#ifdef IEM_WITHOUT_ASSEMBLY
19031static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
19032{
19033 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19034 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
19035 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
19036}
19037
19038
19039IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
19040{
19041 RTUINT64U uSrc = { u64Src };
19042 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
19043 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
19044 *pfMxcsr = fMxcsrOut;
19045}
19046#endif
19047
19048
19049/**
19050 * CVTPI2PD
19051 */
19052#ifdef IEM_WITHOUT_ASSEMBLY
19053static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
19054{
19055 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19056 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
19057 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
19058}
19059
19060
19061IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
19062{
19063 RTUINT64U uSrc = { u64Src };
19064 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
19065 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
19066 *pfMxcsr = fMxcsrOut;
19067}
19068#endif
19069
19070
19071/**
19072 * CVTPS2PI
19073 */
19074#ifdef IEM_WITHOUT_ASSEMBLY
19075static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
19076{
19077 RTFLOAT32U r32Src;
19078 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
19079
19080 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19081 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
19082 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
19083}
19084
19085
19086IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
19087{
19088 RTUINT64U uDst;
19089 RTUINT64U uSrc = { u64Src };
19090 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
19091 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
19092 *pu64Dst = uDst.u;
19093 *pfMxcsr = fMxcsrOut;
19094}
19095#endif
19096
19097
19098/**
19099 * CVTTPS2PI
19100 */
19101#ifdef IEM_WITHOUT_ASSEMBLY
19102static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
19103{
19104 RTFLOAT32U r32Src;
19105 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
19106
19107 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
19108 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
19109 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
19110}
19111
19112
19113IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
19114{
19115 RTUINT64U uDst;
19116 RTUINT64U uSrc = { u64Src };
19117 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
19118 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
19119 *pu64Dst = uDst.u;
19120 *pfMxcsr = fMxcsrOut;
19121}
19122#endif
19123
19124/**
19125 * RDRAND
19126 */
19127IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
19128{
19129 *puDst = 0;
19130 *pEFlags &= ~X86_EFL_STATUS_BITS;
19131 *pEFlags |= X86_EFL_CF;
19132}
19133
19134IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
19135{
19136 *puDst = 0;
19137 *pEFlags &= ~X86_EFL_STATUS_BITS;
19138 *pEFlags |= X86_EFL_CF;
19139}
19140
19141IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
19142{
19143 *puDst = 0;
19144 *pEFlags &= ~X86_EFL_STATUS_BITS;
19145 *pEFlags |= X86_EFL_CF;
19146}
19147
19148/**
19149 * RDSEED
19150 */
19151IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
19152{
19153 *puDst = 0;
19154 *pEFlags &= ~X86_EFL_STATUS_BITS;
19155 *pEFlags |= X86_EFL_CF;
19156}
19157
19158IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
19159{
19160 *puDst = 0;
19161 *pEFlags &= ~X86_EFL_STATUS_BITS;
19162 *pEFlags |= X86_EFL_CF;
19163}
19164
19165IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
19166{
19167 *puDst = 0;
19168 *pEFlags &= ~X86_EFL_STATUS_BITS;
19169 *pEFlags |= X86_EFL_CF;
19170}
19171
19172
19173/**
19174 * SHA1NEXTE
19175 */
19176IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19177{
19178 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
19179
19180 puDst->au32[0] = puSrc->au32[0];
19181 puDst->au32[1] = puSrc->au32[1];
19182 puDst->au32[2] = puSrc->au32[2];
19183 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
19184}
19185
19186/**
19187 * SHA1MSG1
19188 */
19189IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19190{
19191 uint32_t u32W0 = puDst->au32[3];
19192 uint32_t u32W1 = puDst->au32[2];
19193 uint32_t u32W2 = puDst->au32[1];
19194 uint32_t u32W3 = puDst->au32[0];
19195 uint32_t u32W4 = puSrc->au32[3];
19196 uint32_t u32W5 = puSrc->au32[2];
19197
19198 puDst->au32[3] = u32W2 ^ u32W0;
19199 puDst->au32[2] = u32W3 ^ u32W1;
19200 puDst->au32[1] = u32W4 ^ u32W2;
19201 puDst->au32[0] = u32W5 ^ u32W3;
19202}
19203
19204/**
19205 * SHA1MSG2
19206 */
19207IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19208{
19209 uint32_t u32W13 = puSrc->au32[2];
19210 uint32_t u32W14 = puSrc->au32[1];
19211 uint32_t u32W15 = puSrc->au32[0];
19212 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
19213 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
19214 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
19215 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
19216
19217 puDst->au32[3] = u32W16;
19218 puDst->au32[2] = u32W17;
19219 puDst->au32[1] = u32W18;
19220 puDst->au32[0] = u32W19;
19221}
19222
19223/**
19224 * SHA1RNDS4
19225 */
19226typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
19227typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;
19228
19229static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19230{
19231 return (u32B & u32C) ^ (~u32B & u32D);
19232}
19233
19234static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19235{
19236 return u32B ^ u32C ^ u32D;
19237}
19238
19239static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19240{
19241 return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
19242}
19243
19244static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19245{
19246 return u32B ^ u32C ^ u32D;
19247}
19248
19249IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19250{
19251 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
19252 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
19253
19254 uint32_t au32A[5];
19255 uint32_t au32B[5];
19256 uint32_t au32C[5];
19257 uint32_t au32D[5];
19258 uint32_t au32E[5];
19259 uint32_t au32W[4];
19260 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
19261 uint32_t u32K = s_au32K[bEvil & 0x3];
19262
19263 au32A[0] = puDst->au32[3];
19264 au32B[0] = puDst->au32[2];
19265 au32C[0] = puDst->au32[1];
19266 au32D[0] = puDst->au32[0];
19267 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
19268 au32W[i] = puSrc->au32[3 - i];
19269
19270 /* Round 0 is a bit different than the other rounds. */
19271 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
19272 au32B[1] = au32A[0];
19273 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
19274 au32D[1] = au32C[0];
19275 au32E[1] = au32D[0];
19276
19277 for (uint32_t i = 1; i <= 3; i++)
19278 {
19279 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
19280 au32B[i + 1] = au32A[i];
19281 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
19282 au32D[i + 1] = au32C[i];
19283 au32E[i + 1] = au32D[i];
19284 }
19285
19286 puDst->au32[3] = au32A[4];
19287 puDst->au32[2] = au32B[4];
19288 puDst->au32[1] = au32C[4];
19289 puDst->au32[0] = au32D[4];
19290}
19291
19292
19293/**
19294 * SHA256MSG1
19295 */
19296DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
19297{
19298 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
19299}
19300
19301IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19302{
19303 uint32_t u32W4 = puSrc->au32[0];
19304 uint32_t u32W3 = puDst->au32[3];
19305 uint32_t u32W2 = puDst->au32[2];
19306 uint32_t u32W1 = puDst->au32[1];
19307 uint32_t u32W0 = puDst->au32[0];
19308
19309 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
19310 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
19311 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
19312 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
19313}
19314
19315/**
19316 * SHA256MSG2
19317 */
19318DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
19319{
19320 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
19321}
19322
19323IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19324{
19325 uint32_t u32W14 = puSrc->au32[2];
19326 uint32_t u32W15 = puSrc->au32[3];
19327 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
19328 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
19329 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
19330 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
19331
19332 puDst->au32[3] = u32W19;
19333 puDst->au32[2] = u32W18;
19334 puDst->au32[1] = u32W17;
19335 puDst->au32[0] = u32W16;
19336}
19337
19338/**
19339 * SHA256RNDS2
19340 */
19341DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19342{
19343 return (u32X & u32Y) ^ (~u32X & u32Z);
19344}
19345
19346DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19347{
19348 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
19349}
19350
19351DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
19352{
19353 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
19354}
19355
19356DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
19357{
19358 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
19359}
19360
19361IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
19362{
19363 uint32_t au32A[3];
19364 uint32_t au32B[3];
19365 uint32_t au32C[3];
19366 uint32_t au32D[3];
19367 uint32_t au32E[3];
19368 uint32_t au32F[3];
19369 uint32_t au32G[3];
19370 uint32_t au32H[3];
19371 uint32_t au32WK[2];
19372
19373 au32A[0] = puSrc->au32[3];
19374 au32B[0] = puSrc->au32[2];
19375 au32C[0] = puDst->au32[3];
19376 au32D[0] = puDst->au32[2];
19377 au32E[0] = puSrc->au32[1];
19378 au32F[0] = puSrc->au32[0];
19379 au32G[0] = puDst->au32[1];
19380 au32H[0] = puDst->au32[0];
19381
19382 au32WK[0] = puXmm0Constants->au32[0];
19383 au32WK[1] = puXmm0Constants->au32[1];
19384
19385 for (uint32_t i = 0; i < 2; i++)
19386 {
19387 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19388 + iemAImpl_sha256_upper_sigma1(au32E[i])
19389 + au32WK[i]
19390 + au32H[i]
19391 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
19392 + iemAImpl_sha256_upper_sigma0(au32A[i]);
19393 au32B[i + 1] = au32A[i];
19394 au32C[i + 1] = au32B[i];
19395 au32D[i + 1] = au32C[i];
19396 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19397 + iemAImpl_sha256_upper_sigma1(au32E[i])
19398 + au32WK[i]
19399 + au32H[i]
19400 + au32D[i];
19401 au32F[i + 1] = au32E[i];
19402 au32G[i + 1] = au32F[i];
19403 au32H[i + 1] = au32G[i];
19404 }
19405
19406 puDst->au32[3] = au32A[2];
19407 puDst->au32[2] = au32B[2];
19408 puDst->au32[1] = au32E[2];
19409 puDst->au32[0] = au32F[2];
19410}
19411
19412
19413/**
19414 * ADCX
19415 */
/**
 * Emits the body of an ADCX/ADOX worker: adds uSrc and the incoming flag to
 * *puDst and stores the carry-out in the same flag.
 *
 * Expects the variables puDst, uSrc and pfEFlags to be in scope.  Only the
 * @a a_Flag bit of *pfEFlags is modified.
 *
 * @param   a_Flag      The EFLAGS bit used as carry-in and carry-out.
 * @param   a_Type      The unsigned operand type.
 * @param   a_Max       The maximum value of @a a_Type.
 */
#define ADX_EMIT(a_Flag, a_Type, a_Max) \
    do \
    { \
        bool const   fCarryIn  = RT_BOOL(*pfEFlags & (a_Flag)); \
        a_Type const uSum      = *puDst + uSrc; \
        /* Carry out: either the plain sum wrapped, or adding the carry-in will wrap it. */ \
        bool const   fCarryOut = uSum < uSrc || (uSum == (a_Max) && fCarryIn); \
        if (fCarryOut) \
            *pfEFlags |= (a_Flag); \
        else \
            *pfEFlags &= ~(a_Flag); \
        *puDst = fCarryIn ? (a_Type)(uSum + 1) : uSum; \
    } \
    while (0)
19433
19434IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19435{
19436 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
19437}
19438
19439IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19440{
19441 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
19442}
19443
19444# if defined(IEM_WITHOUT_ASSEMBLY)
19445
19446IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19447{
19448 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
19449}
19450
19451IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19452{
19453 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
19454}
19455
19456#endif
19457
19458
19459/**
19460 * ADOX
19461 */
19462IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19463{
19464 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
19465}
19466
19467IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19468{
19469 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
19470}
19471
19472# if defined(IEM_WITHOUT_ASSEMBLY)
19473
19474IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19475{
19476 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
19477}
19478
19479IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19480{
19481 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
19482}
19483
19484# endif
19485
19486
19487/**
19488 * MPSADBW
19489 */
19490IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19491{
19492 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19493 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19494 int16_t ai16Src1[11];
19495 int16_t ai16Src2[4];
19496
19497 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19498 ai16Src1[i] = puDst->au8[idxSrc1 + i];
19499
19500 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19501 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
19502
19503 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19504 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19505 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19506 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19507 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19508}
19509
19510
19511IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
19512{
19513 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19514 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19515 int16_t ai16Src1[11];
19516 int16_t ai16Src2[4];
19517
19518 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19519 ai16Src1[i] = puSrc1->au8[idxSrc1 + i];
19520
19521 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19522 ai16Src2[i] = puSrc2->au8[idxSrc2 + i];
19523
19524 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19525 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19526 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19527 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19528 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19529}
19530
19531
19532IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
19533{
19534 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
19535 RTUINT256U const uSrc2 = *puSrc2;
19536 ASMCompilerBarrier();
19537 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
19538 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil >> 3);
19539}
19540
19541
19542/**
19543 * VPERM2I128
19544 */
19545IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2i128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
19546{
19547 if (bImm & RT_BIT(3))
19548 {
19549 puDst->au64[0] = 0;
19550 puDst->au64[1] = 0;
19551 }
19552 else
19553 {
19554 switch (bImm & 0x3)
19555 {
19556 case 0:
19557 puDst->au64[0] = puSrc1->au64[0];
19558 puDst->au64[1] = puSrc1->au64[1];
19559 break;
19560 case 1:
19561 puDst->au64[0] = puSrc1->au64[2];
19562 puDst->au64[1] = puSrc1->au64[3];
19563 break;
19564 case 2:
19565 puDst->au64[0] = puSrc2->au64[0];
19566 puDst->au64[1] = puSrc2->au64[1];
19567 break;
19568 case 3:
19569 puDst->au64[0] = puSrc2->au64[2];
19570 puDst->au64[1] = puSrc2->au64[3];
19571 break;
19572 }
19573 }
19574
19575 if (bImm & RT_BIT(7))
19576 {
19577 puDst->au64[2] = 0;
19578 puDst->au64[3] = 0;
19579 }
19580 else
19581 {
19582 switch ((bImm >> 4) & 0x3)
19583 {
19584 case 0:
19585 puDst->au64[2] = puSrc1->au64[0];
19586 puDst->au64[3] = puSrc1->au64[1];
19587 break;
19588 case 1:
19589 puDst->au64[2] = puSrc1->au64[2];
19590 puDst->au64[3] = puSrc1->au64[3];
19591 break;
19592 case 2:
19593 puDst->au64[2] = puSrc2->au64[0];
19594 puDst->au64[3] = puSrc2->au64[1];
19595 break;
19596 case 3:
19597 puDst->au64[2] = puSrc2->au64[2];
19598 puDst->au64[3] = puSrc2->au64[3];
19599 break;
19600 }
19601 }
19602}
19603
19604
19605/**
19606 * VPERM2F128
19607 */
19608IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2f128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
19609{
19610 iemAImpl_vperm2i128_u256_fallback(puDst, puSrc1, puSrc2, bImm);
19611}
19612
19613
19614/**
19615 * DPPS
19616 */
19617IEM_DECL_IMPL_DEF(void, iemAImpl_dpps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
19618{
19619 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
19620 AssertReleaseFailed();
19621}
19622
19623
19624/**
19625 * DPPD
19626 */
19627IEM_DECL_IMPL_DEF(void, iemAImpl_dppd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
19628{
19629 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
19630 AssertReleaseFailed();
19631}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette