VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 103627

Last change on this file since 103627 was 103558, checked in by vboxsync, 10 months ago

VMM/IEM: Implement vpermilpd instruction emulations, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 734.7 KB
Line 
1/* $Id: IEMAllAImplC.cpp 103558 2024-02-24 11:06:53Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless already defined by the build, it is defined here for ARM32 and ARM64
 * hosts as well as for doxygen runs.  Defining IEM_WITH_ASSEMBLY overrides
 * (undefines) it again.
 */
#if !defined(IEM_WITHOUT_ASSEMBLY)
# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
#  define IEM_WITHOUT_ASSEMBLY
# endif
#endif
/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
#ifdef IEM_WITH_ASSEMBLY
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Calculates the sign flag value given a result and its bit width.
 *
 * The sign flag (SF) is a duplication of the most significant bit in the
 * result.  The macro shifts that bit down to the X86_EFL_SF_BIT position and
 * masks everything else away.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) & X86_EFL_SF )
73
/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) indicates whether the result is zero or not.  The
 * comparison outcome (0 or 1) is shifted up to the X86_EFL_ZF_BIT position.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)((a_uResult) == 0) << X86_EFL_ZF_BIT )
84
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * These are typically used by concatenating the macro name with a bit count.
 * The problem is that 8-bit values need shifting in the other direction than
 * the others.
 *
 * Each variant takes the overflow indicator from the most significant bit of
 * a value of the given width (bit 7, 15, 31 or 63) and moves it to the
 * X86_EFL_OF_BIT position, masking off all other bits.
 */
#define X86_EFL_GET_OF_8(a_uValue)  (((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_16(a_uValue) ((uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_32(a_uValue) ((uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_64(a_uValue) ((uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
95
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after arithmetic op.
 *
 * This is a statement macro (do { } while (0)) and yields no value; the
 * updated flags are written back through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be a
 *                          plain literal since it is token-pasted into the
 *                          uintNN_t type name and the X86_EFL_GET_OF_NN macro
 *                          name.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 *
 * @note    Several arguments are expanded more than once, and all of them are
 *          expanded inside a scope declaring a local named fEflTmp, so pass
 *          side-effect free expressions that do not refer to that name.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* Overflow during ADDition happens when both inputs have the same sign \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
           follows that for SUBtraction the sign bit value must differ between \
           the two inputs and the result's sign bit must differ from the first \
           input's. \
           Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                      & RT_BIT_64(a_cBitsWidth - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
132
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations. AF on the other hand is
 * undefined. We clear AF, as that seems to make the most sense and also seems
 * to be the correct behavior on current CPUs.
 *
 * This is a statement macro (do { } while (0)) and yields no value; the
 * updated flags are written back through @a a_pfEFlags.  All status bits are
 * cleared first, then PF, ZF and SF are recomputed from @a a_uResult and
 * @a a_fExtra is OR'ed in.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
/**
 * Parity calculation table.
 *
 * Indexed by the low 8 bits of a result.  An entry is X86_EFL_PF when the
 * index has an even number of set bits and 0 when the number is odd, so the
 * looked up value can be OR'ed directly into EFLAGS.
 *
 * This is also used by iemAllAImpl.asm.
 *
 * The generator code:
 * @code
 * #include <stdio.h>
 *
 * int main()
 * {
 *     unsigned b;
 *     for (b = 0; b < 256; b++)
 *     {
 *         int cOnes = ( b       & 1)
 *                   + ((b >> 1) & 1)
 *                   + ((b >> 2) & 1)
 *                   + ((b >> 3) & 1)
 *                   + ((b >> 4) & 1)
 *                   + ((b >> 5) & 1)
 *                   + ((b >> 6) & 1)
 *                   + ((b >> 7) & 1);
 *         printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
 *                b,
 *                (b >> 7) & 1,
 *                (b >> 6) & 1,
 *                (b >> 5) & 1,
 *                (b >> 4) & 1,
 *                (b >> 3) & 1,
 *                (b >> 2) & 1,
 *                (b >> 1) & 1,
 *                b        & 1,
 *                cOnes & 1 ? "0" : "X86_EFL_PF");
 *     }
 *     return 0;
 * }
 * @endcode
 */
uint8_t const g_afParity[256] =
{
    /* 0000 = 00000000b */ X86_EFL_PF,
    /* 0x01 = 00000001b */ 0,
    /* 0x02 = 00000010b */ 0,
    /* 0x03 = 00000011b */ X86_EFL_PF,
    /* 0x04 = 00000100b */ 0,
    /* 0x05 = 00000101b */ X86_EFL_PF,
    /* 0x06 = 00000110b */ X86_EFL_PF,
    /* 0x07 = 00000111b */ 0,
    /* 0x08 = 00001000b */ 0,
    /* 0x09 = 00001001b */ X86_EFL_PF,
    /* 0x0a = 00001010b */ X86_EFL_PF,
    /* 0x0b = 00001011b */ 0,
    /* 0x0c = 00001100b */ X86_EFL_PF,
    /* 0x0d = 00001101b */ 0,
    /* 0x0e = 00001110b */ 0,
    /* 0x0f = 00001111b */ X86_EFL_PF,
    /* 0x10 = 00010000b */ 0,
    /* 0x11 = 00010001b */ X86_EFL_PF,
    /* 0x12 = 00010010b */ X86_EFL_PF,
    /* 0x13 = 00010011b */ 0,
    /* 0x14 = 00010100b */ X86_EFL_PF,
    /* 0x15 = 00010101b */ 0,
    /* 0x16 = 00010110b */ 0,
    /* 0x17 = 00010111b */ X86_EFL_PF,
    /* 0x18 = 00011000b */ X86_EFL_PF,
    /* 0x19 = 00011001b */ 0,
    /* 0x1a = 00011010b */ 0,
    /* 0x1b = 00011011b */ X86_EFL_PF,
    /* 0x1c = 00011100b */ 0,
    /* 0x1d = 00011101b */ X86_EFL_PF,
    /* 0x1e = 00011110b */ X86_EFL_PF,
    /* 0x1f = 00011111b */ 0,
    /* 0x20 = 00100000b */ 0,
    /* 0x21 = 00100001b */ X86_EFL_PF,
    /* 0x22 = 00100010b */ X86_EFL_PF,
    /* 0x23 = 00100011b */ 0,
    /* 0x24 = 00100100b */ X86_EFL_PF,
    /* 0x25 = 00100101b */ 0,
    /* 0x26 = 00100110b */ 0,
    /* 0x27 = 00100111b */ X86_EFL_PF,
    /* 0x28 = 00101000b */ X86_EFL_PF,
    /* 0x29 = 00101001b */ 0,
    /* 0x2a = 00101010b */ 0,
    /* 0x2b = 00101011b */ X86_EFL_PF,
    /* 0x2c = 00101100b */ 0,
    /* 0x2d = 00101101b */ X86_EFL_PF,
    /* 0x2e = 00101110b */ X86_EFL_PF,
    /* 0x2f = 00101111b */ 0,
    /* 0x30 = 00110000b */ X86_EFL_PF,
    /* 0x31 = 00110001b */ 0,
    /* 0x32 = 00110010b */ 0,
    /* 0x33 = 00110011b */ X86_EFL_PF,
    /* 0x34 = 00110100b */ 0,
    /* 0x35 = 00110101b */ X86_EFL_PF,
    /* 0x36 = 00110110b */ X86_EFL_PF,
    /* 0x37 = 00110111b */ 0,
    /* 0x38 = 00111000b */ 0,
    /* 0x39 = 00111001b */ X86_EFL_PF,
    /* 0x3a = 00111010b */ X86_EFL_PF,
    /* 0x3b = 00111011b */ 0,
    /* 0x3c = 00111100b */ X86_EFL_PF,
    /* 0x3d = 00111101b */ 0,
    /* 0x3e = 00111110b */ 0,
    /* 0x3f = 00111111b */ X86_EFL_PF,
    /* 0x40 = 01000000b */ 0,
    /* 0x41 = 01000001b */ X86_EFL_PF,
    /* 0x42 = 01000010b */ X86_EFL_PF,
    /* 0x43 = 01000011b */ 0,
    /* 0x44 = 01000100b */ X86_EFL_PF,
    /* 0x45 = 01000101b */ 0,
    /* 0x46 = 01000110b */ 0,
    /* 0x47 = 01000111b */ X86_EFL_PF,
    /* 0x48 = 01001000b */ X86_EFL_PF,
    /* 0x49 = 01001001b */ 0,
    /* 0x4a = 01001010b */ 0,
    /* 0x4b = 01001011b */ X86_EFL_PF,
    /* 0x4c = 01001100b */ 0,
    /* 0x4d = 01001101b */ X86_EFL_PF,
    /* 0x4e = 01001110b */ X86_EFL_PF,
    /* 0x4f = 01001111b */ 0,
    /* 0x50 = 01010000b */ X86_EFL_PF,
    /* 0x51 = 01010001b */ 0,
    /* 0x52 = 01010010b */ 0,
    /* 0x53 = 01010011b */ X86_EFL_PF,
    /* 0x54 = 01010100b */ 0,
    /* 0x55 = 01010101b */ X86_EFL_PF,
    /* 0x56 = 01010110b */ X86_EFL_PF,
    /* 0x57 = 01010111b */ 0,
    /* 0x58 = 01011000b */ 0,
    /* 0x59 = 01011001b */ X86_EFL_PF,
    /* 0x5a = 01011010b */ X86_EFL_PF,
    /* 0x5b = 01011011b */ 0,
    /* 0x5c = 01011100b */ X86_EFL_PF,
    /* 0x5d = 01011101b */ 0,
    /* 0x5e = 01011110b */ 0,
    /* 0x5f = 01011111b */ X86_EFL_PF,
    /* 0x60 = 01100000b */ X86_EFL_PF,
    /* 0x61 = 01100001b */ 0,
    /* 0x62 = 01100010b */ 0,
    /* 0x63 = 01100011b */ X86_EFL_PF,
    /* 0x64 = 01100100b */ 0,
    /* 0x65 = 01100101b */ X86_EFL_PF,
    /* 0x66 = 01100110b */ X86_EFL_PF,
    /* 0x67 = 01100111b */ 0,
    /* 0x68 = 01101000b */ 0,
    /* 0x69 = 01101001b */ X86_EFL_PF,
    /* 0x6a = 01101010b */ X86_EFL_PF,
    /* 0x6b = 01101011b */ 0,
    /* 0x6c = 01101100b */ X86_EFL_PF,
    /* 0x6d = 01101101b */ 0,
    /* 0x6e = 01101110b */ 0,
    /* 0x6f = 01101111b */ X86_EFL_PF,
    /* 0x70 = 01110000b */ 0,
    /* 0x71 = 01110001b */ X86_EFL_PF,
    /* 0x72 = 01110010b */ X86_EFL_PF,
    /* 0x73 = 01110011b */ 0,
    /* 0x74 = 01110100b */ X86_EFL_PF,
    /* 0x75 = 01110101b */ 0,
    /* 0x76 = 01110110b */ 0,
    /* 0x77 = 01110111b */ X86_EFL_PF,
    /* 0x78 = 01111000b */ X86_EFL_PF,
    /* 0x79 = 01111001b */ 0,
    /* 0x7a = 01111010b */ 0,
    /* 0x7b = 01111011b */ X86_EFL_PF,
    /* 0x7c = 01111100b */ 0,
    /* 0x7d = 01111101b */ X86_EFL_PF,
    /* 0x7e = 01111110b */ X86_EFL_PF,
    /* 0x7f = 01111111b */ 0,
    /* 0x80 = 10000000b */ 0,
    /* 0x81 = 10000001b */ X86_EFL_PF,
    /* 0x82 = 10000010b */ X86_EFL_PF,
    /* 0x83 = 10000011b */ 0,
    /* 0x84 = 10000100b */ X86_EFL_PF,
    /* 0x85 = 10000101b */ 0,
    /* 0x86 = 10000110b */ 0,
    /* 0x87 = 10000111b */ X86_EFL_PF,
    /* 0x88 = 10001000b */ X86_EFL_PF,
    /* 0x89 = 10001001b */ 0,
    /* 0x8a = 10001010b */ 0,
    /* 0x8b = 10001011b */ X86_EFL_PF,
    /* 0x8c = 10001100b */ 0,
    /* 0x8d = 10001101b */ X86_EFL_PF,
    /* 0x8e = 10001110b */ X86_EFL_PF,
    /* 0x8f = 10001111b */ 0,
    /* 0x90 = 10010000b */ X86_EFL_PF,
    /* 0x91 = 10010001b */ 0,
    /* 0x92 = 10010010b */ 0,
    /* 0x93 = 10010011b */ X86_EFL_PF,
    /* 0x94 = 10010100b */ 0,
    /* 0x95 = 10010101b */ X86_EFL_PF,
    /* 0x96 = 10010110b */ X86_EFL_PF,
    /* 0x97 = 10010111b */ 0,
    /* 0x98 = 10011000b */ 0,
    /* 0x99 = 10011001b */ X86_EFL_PF,
    /* 0x9a = 10011010b */ X86_EFL_PF,
    /* 0x9b = 10011011b */ 0,
    /* 0x9c = 10011100b */ X86_EFL_PF,
    /* 0x9d = 10011101b */ 0,
    /* 0x9e = 10011110b */ 0,
    /* 0x9f = 10011111b */ X86_EFL_PF,
    /* 0xa0 = 10100000b */ X86_EFL_PF,
    /* 0xa1 = 10100001b */ 0,
    /* 0xa2 = 10100010b */ 0,
    /* 0xa3 = 10100011b */ X86_EFL_PF,
    /* 0xa4 = 10100100b */ 0,
    /* 0xa5 = 10100101b */ X86_EFL_PF,
    /* 0xa6 = 10100110b */ X86_EFL_PF,
    /* 0xa7 = 10100111b */ 0,
    /* 0xa8 = 10101000b */ 0,
    /* 0xa9 = 10101001b */ X86_EFL_PF,
    /* 0xaa = 10101010b */ X86_EFL_PF,
    /* 0xab = 10101011b */ 0,
    /* 0xac = 10101100b */ X86_EFL_PF,
    /* 0xad = 10101101b */ 0,
    /* 0xae = 10101110b */ 0,
    /* 0xaf = 10101111b */ X86_EFL_PF,
    /* 0xb0 = 10110000b */ 0,
    /* 0xb1 = 10110001b */ X86_EFL_PF,
    /* 0xb2 = 10110010b */ X86_EFL_PF,
    /* 0xb3 = 10110011b */ 0,
    /* 0xb4 = 10110100b */ X86_EFL_PF,
    /* 0xb5 = 10110101b */ 0,
    /* 0xb6 = 10110110b */ 0,
    /* 0xb7 = 10110111b */ X86_EFL_PF,
    /* 0xb8 = 10111000b */ X86_EFL_PF,
    /* 0xb9 = 10111001b */ 0,
    /* 0xba = 10111010b */ 0,
    /* 0xbb = 10111011b */ X86_EFL_PF,
    /* 0xbc = 10111100b */ 0,
    /* 0xbd = 10111101b */ X86_EFL_PF,
    /* 0xbe = 10111110b */ X86_EFL_PF,
    /* 0xbf = 10111111b */ 0,
    /* 0xc0 = 11000000b */ X86_EFL_PF,
    /* 0xc1 = 11000001b */ 0,
    /* 0xc2 = 11000010b */ 0,
    /* 0xc3 = 11000011b */ X86_EFL_PF,
    /* 0xc4 = 11000100b */ 0,
    /* 0xc5 = 11000101b */ X86_EFL_PF,
    /* 0xc6 = 11000110b */ X86_EFL_PF,
    /* 0xc7 = 11000111b */ 0,
    /* 0xc8 = 11001000b */ 0,
    /* 0xc9 = 11001001b */ X86_EFL_PF,
    /* 0xca = 11001010b */ X86_EFL_PF,
    /* 0xcb = 11001011b */ 0,
    /* 0xcc = 11001100b */ X86_EFL_PF,
    /* 0xcd = 11001101b */ 0,
    /* 0xce = 11001110b */ 0,
    /* 0xcf = 11001111b */ X86_EFL_PF,
    /* 0xd0 = 11010000b */ 0,
    /* 0xd1 = 11010001b */ X86_EFL_PF,
    /* 0xd2 = 11010010b */ X86_EFL_PF,
    /* 0xd3 = 11010011b */ 0,
    /* 0xd4 = 11010100b */ X86_EFL_PF,
    /* 0xd5 = 11010101b */ 0,
    /* 0xd6 = 11010110b */ 0,
    /* 0xd7 = 11010111b */ X86_EFL_PF,
    /* 0xd8 = 11011000b */ X86_EFL_PF,
    /* 0xd9 = 11011001b */ 0,
    /* 0xda = 11011010b */ 0,
    /* 0xdb = 11011011b */ X86_EFL_PF,
    /* 0xdc = 11011100b */ 0,
    /* 0xdd = 11011101b */ X86_EFL_PF,
    /* 0xde = 11011110b */ X86_EFL_PF,
    /* 0xdf = 11011111b */ 0,
    /* 0xe0 = 11100000b */ 0,
    /* 0xe1 = 11100001b */ X86_EFL_PF,
    /* 0xe2 = 11100010b */ X86_EFL_PF,
    /* 0xe3 = 11100011b */ 0,
    /* 0xe4 = 11100100b */ X86_EFL_PF,
    /* 0xe5 = 11100101b */ 0,
    /* 0xe6 = 11100110b */ 0,
    /* 0xe7 = 11100111b */ X86_EFL_PF,
    /* 0xe8 = 11101000b */ X86_EFL_PF,
    /* 0xe9 = 11101001b */ 0,
    /* 0xea = 11101010b */ 0,
    /* 0xeb = 11101011b */ X86_EFL_PF,
    /* 0xec = 11101100b */ 0,
    /* 0xed = 11101101b */ X86_EFL_PF,
    /* 0xee = 11101110b */ X86_EFL_PF,
    /* 0xef = 11101111b */ 0,
    /* 0xf0 = 11110000b */ X86_EFL_PF,
    /* 0xf1 = 11110001b */ 0,
    /* 0xf2 = 11110010b */ 0,
    /* 0xf3 = 11110011b */ X86_EFL_PF,
    /* 0xf4 = 11110100b */ 0,
    /* 0xf5 = 11110101b */ X86_EFL_PF,
    /* 0xf6 = 11110110b */ X86_EFL_PF,
    /* 0xf7 = 11110111b */ 0,
    /* 0xf8 = 11111000b */ 0,
    /* 0xf9 = 11111001b */ X86_EFL_PF,
    /* 0xfa = 11111010b */ X86_EFL_PF,
    /* 0xfb = 11111011b */ 0,
    /* 0xfc = 11111100b */ X86_EFL_PF,
    /* 0xfd = 11111101b */ 0,
    /* 0xfe = 11111110b */ 0,
    /* 0xff = 11111111b */ X86_EFL_PF,
};
458
/* for clang: Forward declarations of the floating point constants defined
   further down in this file.
   NOTE(review): presumably these keep clang from warning about externally
   visible definitions without a prior declaration - confirm. */
extern const RTFLOAT32U g_ar32Zero[];
extern const RTFLOAT64U g_ar64Zero[];
extern const RTFLOAT80U g_ar80Zero[];
extern const RTFLOAT32U g_ar32One[];
extern const RTFLOAT80U g_ar80One[];
extern const RTFLOAT80U g_r80Indefinite;
extern const RTFLOAT32U g_ar32Infinity[];
extern const RTFLOAT64U g_ar64Infinity[];
extern const RTFLOAT80U g_ar80Infinity[];
extern const RTFLOAT128U g_r128Ln2;
extern const RTUINT128U g_u128Ln2Mantissa;
extern const RTUINT128U g_u128Ln2MantissaIntel;
extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
extern const RTFLOAT32U g_ar32QNaN[];
extern const RTFLOAT64U g_ar64QNaN[];
475
/*
 * Two-element constant arrays below are indexed by the sign: element [0] is
 * initialized with sign argument 0 and element [1] with sign argument 1.
 */

/** Zero values (indexed by fSign). */
RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };

/** One values (indexed by fSign). */
RTFLOAT32U const g_ar32One[] =
{ RTFLOAT32U_INIT(0, 0, RTFLOAT32U_EXP_BIAS), RTFLOAT32U_INIT(1, 0, RTFLOAT32U_EXP_BIAS) };
RTFLOAT80U const g_ar80One[] =
{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };

/** Indefinite (negative). */
RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);

/** Infinities (indexed by fSign). */
RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };

/** Default QNaNs (indexed by fSign). */
RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
498
499
#if 0
/** 128-bit floating point constant: 2.0
 * @note Currently compiled out by the enclosing if-0 block. */
const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
#endif
504
505
506/* The next section is generated by tools/IEMGenFpuConstants: */
507
/** The ln2 constant as 128-bit floating point value.
 * base-10: 6.93147180559945309417232121458176575e-1
 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1
 *
 * @note The digits above match the commented out initializer.  The active
 *       initializer differs from it in the third argument, where the low
 *       order bits have been zeroed (0xf357900000000000 instead of
 *       0xf35793c7673007e6). */
//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
/** High precision ln2 value.
 * base-10: 6.931471805599453094172321214581765680747e-1
 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
 * base-10: 6.931471805599453094151379470289064954613e-1
 * base-16: b.17217f7d1cf79abc0000000000000000@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
524
/** Horner constants for f2xm1.
 *
 * Going by the base-10 values listed with the entries, entry aN holds
 * 1/(N+1)!, i.e. 1, 1/2, 1/6, 1/24 and so on, for N = 0 thru 21. */
const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
{
    /* a0
     * base-10: 1.00000000000000000000000000000000000e0
     * base-16: 1.0000000000000000000000000000@0
     * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
    RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
    /* a1
     * base-10: 5.00000000000000000000000000000000000e-1
     * base-16: 8.0000000000000000000000000000@-1
     * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
    RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
    /* a2
     * base-10: 1.66666666666666666666666666666666658e-1
     * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
     * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
    RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
    /* a3
     * base-10: 4.16666666666666666666666666666666646e-2
     * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
     * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
    RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
    /* a4
     * base-10: 8.33333333333333333333333333333333323e-3
     * base-16: 2.2222222222222222222222222222@-2
     * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
    RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
    /* a5
     * base-10: 1.38888888888888888888888888888888874e-3
     * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
     * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
    RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
    /* a6
     * base-10: 1.98412698412698412698412698412698412e-4
     * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
     * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
    RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
    /* a7
     * base-10: 2.48015873015873015873015873015873015e-5
     * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
     * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
    RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
    /* a8
     * base-10: 2.75573192239858906525573192239858902e-6
     * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
     * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
    RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
    /* a9
     * base-10: 2.75573192239858906525573192239858865e-7
     * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
     * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
    RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
    /* a10
     * base-10: 2.50521083854417187750521083854417184e-8
     * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
     * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
    RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
    /* a11
     * base-10: 2.08767569878680989792100903212014296e-9
     * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
     * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
    RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
    /* a12
     * base-10: 1.60590438368216145993923771701549472e-10
     * base-16: b.092309d43684be51c198e91d7b40@-9
     * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
    RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
    /* a13
     * base-10: 1.14707455977297247138516979786821043e-11
     * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
     * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
    RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
    /* a14
     * base-10: 7.64716373181981647590113198578806964e-13
     * base-16: d.73f9f399dc0f88ec32b587746578@-11
     * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
    RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
    /* a15
     * base-10: 4.77947733238738529743820749111754352e-14
     * base-16: d.73f9f399dc0f88ec32b587746578@-12
     * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
    RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
    /* a16
     * base-10: 2.81145725434552076319894558301031970e-15
     * base-16: c.a963b81856a53593028cbbb8d7f8@-13
     * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
    RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
    /* a17
     * base-10: 1.56192069685862264622163643500573321e-16
     * base-16: b.413c31dcbecbbdd8024435161550@-14
     * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
    RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
    /* a18
     * base-10: 8.22063524662432971695598123687227980e-18
     * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
     * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
    RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
    /* a19
     * base-10: 4.11031762331216485847799061843614006e-19
     * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
     * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
    RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
    /* a20
     * base-10: 1.95729410633912612308475743735054143e-20
     * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
     * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
    RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
    /* a21
     * base-10: 8.89679139245057328674889744250246106e-22
     * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
     * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
    RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
};
639
640
641/*
642 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
643 * it all in C is probably safer atm., optimize what's necessary later, maybe.
644 */
645#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
646
647
648/*********************************************************************************************************************************
649* Binary Operations *
650*********************************************************************************************************************************/
651
652/*
653 * ADD
654 */
655
656IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
657{
658 uint64_t uDst = *puDst;
659 uint64_t uResult = uDst + uSrc;
660 *puDst = uResult;
661 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
662}
663
664# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
665
666IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
667{
668 uint32_t uDst = *puDst;
669 uint32_t uResult = uDst + uSrc;
670 *puDst = uResult;
671 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
672}
673
674
675IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
676{
677 uint16_t uDst = *puDst;
678 uint16_t uResult = uDst + uSrc;
679 *puDst = uResult;
680 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
681}
682
683
684IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
685{
686 uint8_t uDst = *puDst;
687 uint8_t uResult = uDst + uSrc;
688 *puDst = uResult;
689 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
690}
691
692# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
693
694/*
695 * ADC
696 */
697
698IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
699{
700 if (!(*pfEFlags & X86_EFL_CF))
701 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
702 else
703 {
704 uint64_t uDst = *puDst;
705 uint64_t uResult = uDst + uSrc + 1;
706 *puDst = uResult;
707 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
708 }
709}
710
711# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
712
713IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
714{
715 if (!(*pfEFlags & X86_EFL_CF))
716 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
717 else
718 {
719 uint32_t uDst = *puDst;
720 uint32_t uResult = uDst + uSrc + 1;
721 *puDst = uResult;
722 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
723 }
724}
725
726
727IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
728{
729 if (!(*pfEFlags & X86_EFL_CF))
730 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
731 else
732 {
733 uint16_t uDst = *puDst;
734 uint16_t uResult = uDst + uSrc + 1;
735 *puDst = uResult;
736 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
737 }
738}
739
740
741IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
742{
743 if (!(*pfEFlags & X86_EFL_CF))
744 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
745 else
746 {
747 uint8_t uDst = *puDst;
748 uint8_t uResult = uDst + uSrc + 1;
749 *puDst = uResult;
750 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
751 }
752}
753
754# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
755
756/*
757 * SUB
758 */
759# if !defined(RT_ARCH_ARM64)
760
761IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
762{
763 uint64_t uDst = *puDst;
764 uint64_t uResult = uDst - uSrc;
765 *puDst = uResult;
766 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
767}
768
769# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
770
771IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
772{
773 uint32_t uDst = *puDst;
774 uint32_t uResult = uDst - uSrc;
775 *puDst = uResult;
776 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
777}
778
779
780IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
781{
782 uint16_t uDst = *puDst;
783 uint16_t uResult = uDst - uSrc;
784 *puDst = uResult;
785 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
786}
787
788
789IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
790{
791 uint8_t uDst = *puDst;
792 uint8_t uResult = uDst - uSrc;
793 *puDst = uResult;
794 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
795}
796
797# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
798# endif /* !RT_ARCH_ARM64 */
799
800/*
801 * SBB
802 */
803
804IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
805{
806 if (!(*pfEFlags & X86_EFL_CF))
807 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
808 else
809 {
810 uint64_t uDst = *puDst;
811 uint64_t uResult = uDst - uSrc - 1;
812 *puDst = uResult;
813 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
814 }
815}
816
817# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
818
819IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
820{
821 if (!(*pfEFlags & X86_EFL_CF))
822 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
823 else
824 {
825 uint32_t uDst = *puDst;
826 uint32_t uResult = uDst - uSrc - 1;
827 *puDst = uResult;
828 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
829 }
830}
831
832
833IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
834{
835 if (!(*pfEFlags & X86_EFL_CF))
836 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
837 else
838 {
839 uint16_t uDst = *puDst;
840 uint16_t uResult = uDst - uSrc - 1;
841 *puDst = uResult;
842 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
843 }
844}
845
846
847IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
848{
849 if (!(*pfEFlags & X86_EFL_CF))
850 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
851 else
852 {
853 uint8_t uDst = *puDst;
854 uint8_t uResult = uDst - uSrc - 1;
855 *puDst = uResult;
856 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
857 }
858}
859
860# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
861
862
863/*
864 * OR
865 */
866
867IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
868{
869 uint64_t uResult = *puDst | uSrc;
870 *puDst = uResult;
871 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
872}
873
874# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
875
876IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
877{
878 uint32_t uResult = *puDst | uSrc;
879 *puDst = uResult;
880 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
881}
882
883
884IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
885{
886 uint16_t uResult = *puDst | uSrc;
887 *puDst = uResult;
888 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
889}
890
891
892IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
893{
894 uint8_t uResult = *puDst | uSrc;
895 *puDst = uResult;
896 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
897}
898
899# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
900
901/*
902 * XOR
903 */
904
905IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
906{
907 uint64_t uResult = *puDst ^ uSrc;
908 *puDst = uResult;
909 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
910}
911
912# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
913
914IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
915{
916 uint32_t uResult = *puDst ^ uSrc;
917 *puDst = uResult;
918 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
919}
920
921
922IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
923{
924 uint16_t uResult = *puDst ^ uSrc;
925 *puDst = uResult;
926 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
927}
928
929
930IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
931{
932 uint8_t uResult = *puDst ^ uSrc;
933 *puDst = uResult;
934 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
935}
936
937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
938
939/*
940 * AND
941 */
942
943IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
944{
945 uint64_t const uResult = *puDst & uSrc;
946 *puDst = uResult;
947 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
948}
949
950# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
951
952IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
953{
954 uint32_t const uResult = *puDst & uSrc;
955 *puDst = uResult;
956 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
957}
958
959
960IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
961{
962 uint16_t const uResult = *puDst & uSrc;
963 *puDst = uResult;
964 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
965}
966
967
968IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
969{
970 uint8_t const uResult = *puDst & uSrc;
971 *puDst = uResult;
972 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
973}
974
975# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
976#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
977
978/*
979 * ANDN (BMI1 instruction)
980 */
981
982IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
983{
984 uint64_t const uResult = ~uSrc1 & uSrc2;
985 *puDst = uResult;
986 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
987}
988
989
990IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
991{
992 uint32_t const uResult = ~uSrc1 & uSrc2;
993 *puDst = uResult;
994 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
995}
996
997
998#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
999IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
1000{
1001 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1002}
1003#endif
1004
1005
1006#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1007IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1008{
1009 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1010}
1011#endif
1012
1013#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1014
1015/*
1016 * CMP
1017 */
1018
1019IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1020{
1021 uint64_t uDstTmp = *puDst;
1022 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1023}
1024
1025# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1026
1027IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1028{
1029 uint32_t uDstTmp = *puDst;
1030 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1031}
1032
1033
1034IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1035{
1036 uint16_t uDstTmp = *puDst;
1037 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1038}
1039
1040
1041IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1042{
1043 uint8_t uDstTmp = *puDst;
1044 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1045}
1046
1047# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1048
1049/*
1050 * TEST
1051 */
1052
1053IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1054{
1055 uint64_t uResult = *puDst & uSrc;
1056 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
1057}
1058
1059# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1060
1061IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1062{
1063 uint32_t uResult = *puDst & uSrc;
1064 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
1065}
1066
1067
1068IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1069{
1070 uint16_t uResult = *puDst & uSrc;
1071 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
1072}
1073
1074
1075IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1076{
1077 uint8_t uResult = *puDst & uSrc;
1078 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
1079}
1080
1081# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1082
1083
1084/*
1085 * LOCK prefixed variants of the above
1086 */
1087
/**
 * Locked binary operand operation for any operand width (8/16/32/64).
 *
 * Expects puDst, uSrc and pfEFlags in the expansion scope.  The non-locked
 * worker runs on a private copy of the destination and of the flags, and the
 * result is published with a compare-exchange that is retried until it
 * succeeds.  The caller's flags are only written once the exchange succeeded.
 *
 * NOTE(review): the retry relies on ASMAtomicCmpXchgEx* refreshing the last
 * argument with the current memory value on failure - confirm against IPRT.
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uSeen = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uNew; \
        uint32_t                   fEflNew; \
        for (;;) \
        { \
            uNew    = uSeen; \
            fEflNew = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNew, uSrc, &fEflNew); \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uSeen, &uSeen)) \
                break; \
        } \
        *pfEFlags = fEflNew; \
    } while (0)
1102
1103
/**
 * Emits iemAImpl_<mnemonic>_u<width>_locked, the LOCK prefixed variant of a
 * binary operand worker, implemented by means of DO_LOCKED_BIN_OP.
 *
 * @param   a_Mnemonic      Instruction mnemonic used in the worker name.
 * @param   a_cBitsWidth    Operand width in bits (8, 16, 32 or 64).
 */
#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked, \
                      (uint ## a_cBitsWidth ## _t *puDst, uint ## a_cBitsWidth ## _t uSrc, uint32_t *pfEFlags)) \
    { \
        DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
    }
1111
1112EMIT_LOCKED_BIN_OP(add, 64)
1113EMIT_LOCKED_BIN_OP(adc, 64)
1114EMIT_LOCKED_BIN_OP(sub, 64)
1115EMIT_LOCKED_BIN_OP(sbb, 64)
1116EMIT_LOCKED_BIN_OP(or, 64)
1117EMIT_LOCKED_BIN_OP(xor, 64)
1118EMIT_LOCKED_BIN_OP(and, 64)
1119# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1120EMIT_LOCKED_BIN_OP(add, 32)
1121EMIT_LOCKED_BIN_OP(adc, 32)
1122EMIT_LOCKED_BIN_OP(sub, 32)
1123EMIT_LOCKED_BIN_OP(sbb, 32)
1124EMIT_LOCKED_BIN_OP(or, 32)
1125EMIT_LOCKED_BIN_OP(xor, 32)
1126EMIT_LOCKED_BIN_OP(and, 32)
1127
1128EMIT_LOCKED_BIN_OP(add, 16)
1129EMIT_LOCKED_BIN_OP(adc, 16)
1130EMIT_LOCKED_BIN_OP(sub, 16)
1131EMIT_LOCKED_BIN_OP(sbb, 16)
1132EMIT_LOCKED_BIN_OP(or, 16)
1133EMIT_LOCKED_BIN_OP(xor, 16)
1134EMIT_LOCKED_BIN_OP(and, 16)
1135
1136EMIT_LOCKED_BIN_OP(add, 8)
1137EMIT_LOCKED_BIN_OP(adc, 8)
1138EMIT_LOCKED_BIN_OP(sub, 8)
1139EMIT_LOCKED_BIN_OP(sbb, 8)
1140EMIT_LOCKED_BIN_OP(or, 8)
1141EMIT_LOCKED_BIN_OP(xor, 8)
1142EMIT_LOCKED_BIN_OP(and, 8)
1143# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1144
1145
1146/*
1147 * Bit operations (same signature as above).
1148 */
1149
1150/*
1151 * BT
1152 */
1153
1154IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1155{
1156 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1157 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1158 Assert(uSrc < 64);
1159 uint64_t uDst = *puDst;
1160 if (uDst & RT_BIT_64(uSrc))
1161 *pfEFlags |= X86_EFL_CF;
1162 else
1163 *pfEFlags &= ~X86_EFL_CF;
1164}
1165
1166# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1167
1168IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1169{
1170 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1171 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1172 Assert(uSrc < 32);
1173 uint32_t uDst = *puDst;
1174 if (uDst & RT_BIT_32(uSrc))
1175 *pfEFlags |= X86_EFL_CF;
1176 else
1177 *pfEFlags &= ~X86_EFL_CF;
1178}
1179
1180IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1181{
1182 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1183 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1184 Assert(uSrc < 16);
1185 uint16_t uDst = *puDst;
1186 if (uDst & RT_BIT_32(uSrc))
1187 *pfEFlags |= X86_EFL_CF;
1188 else
1189 *pfEFlags &= ~X86_EFL_CF;
1190}
1191
1192# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1193
1194/*
1195 * BTC
1196 */
1197
1198IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1199{
1200 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1201 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1202 Assert(uSrc < 64);
1203 uint64_t fMask = RT_BIT_64(uSrc);
1204 uint64_t uDst = *puDst;
1205 if (uDst & fMask)
1206 {
1207 uDst &= ~fMask;
1208 *puDst = uDst;
1209 *pfEFlags |= X86_EFL_CF;
1210 }
1211 else
1212 {
1213 uDst |= fMask;
1214 *puDst = uDst;
1215 *pfEFlags &= ~X86_EFL_CF;
1216 }
1217}
1218
1219# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1220
1221IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1222{
1223 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1224 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1225 Assert(uSrc < 32);
1226 uint32_t fMask = RT_BIT_32(uSrc);
1227 uint32_t uDst = *puDst;
1228 if (uDst & fMask)
1229 {
1230 uDst &= ~fMask;
1231 *puDst = uDst;
1232 *pfEFlags |= X86_EFL_CF;
1233 }
1234 else
1235 {
1236 uDst |= fMask;
1237 *puDst = uDst;
1238 *pfEFlags &= ~X86_EFL_CF;
1239 }
1240}
1241
1242
1243IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1244{
1245 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1246 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1247 Assert(uSrc < 16);
1248 uint16_t fMask = RT_BIT_32(uSrc);
1249 uint16_t uDst = *puDst;
1250 if (uDst & fMask)
1251 {
1252 uDst &= ~fMask;
1253 *puDst = uDst;
1254 *pfEFlags |= X86_EFL_CF;
1255 }
1256 else
1257 {
1258 uDst |= fMask;
1259 *puDst = uDst;
1260 *pfEFlags &= ~X86_EFL_CF;
1261 }
1262}
1263
1264# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1265
1266/*
1267 * BTR
1268 */
1269
1270IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1271{
1272 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1273 logical operation (AND/OR/whatever). */
1274 Assert(uSrc < 64);
1275 uint64_t fMask = RT_BIT_64(uSrc);
1276 uint64_t uDst = *puDst;
1277 if (uDst & fMask)
1278 {
1279 uDst &= ~fMask;
1280 *puDst = uDst;
1281 *pfEFlags |= X86_EFL_CF;
1282 }
1283 else
1284 *pfEFlags &= ~X86_EFL_CF;
1285}
1286
1287# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1288
1289IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1290{
1291 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1292 logical operation (AND/OR/whatever). */
1293 Assert(uSrc < 32);
1294 uint32_t fMask = RT_BIT_32(uSrc);
1295 uint32_t uDst = *puDst;
1296 if (uDst & fMask)
1297 {
1298 uDst &= ~fMask;
1299 *puDst = uDst;
1300 *pfEFlags |= X86_EFL_CF;
1301 }
1302 else
1303 *pfEFlags &= ~X86_EFL_CF;
1304}
1305
1306
1307IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1308{
1309 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1310 logical operation (AND/OR/whatever). */
1311 Assert(uSrc < 16);
1312 uint16_t fMask = RT_BIT_32(uSrc);
1313 uint16_t uDst = *puDst;
1314 if (uDst & fMask)
1315 {
1316 uDst &= ~fMask;
1317 *puDst = uDst;
1318 *pfEFlags |= X86_EFL_CF;
1319 }
1320 else
1321 *pfEFlags &= ~X86_EFL_CF;
1322}
1323
1324# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1325
1326/*
1327 * BTS
1328 */
1329
1330IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1331{
1332 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1333 logical operation (AND/OR/whatever). */
1334 Assert(uSrc < 64);
1335 uint64_t fMask = RT_BIT_64(uSrc);
1336 uint64_t uDst = *puDst;
1337 if (uDst & fMask)
1338 *pfEFlags |= X86_EFL_CF;
1339 else
1340 {
1341 uDst |= fMask;
1342 *puDst = uDst;
1343 *pfEFlags &= ~X86_EFL_CF;
1344 }
1345}
1346
1347# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1348
1349IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1350{
1351 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1352 logical operation (AND/OR/whatever). */
1353 Assert(uSrc < 32);
1354 uint32_t fMask = RT_BIT_32(uSrc);
1355 uint32_t uDst = *puDst;
1356 if (uDst & fMask)
1357 *pfEFlags |= X86_EFL_CF;
1358 else
1359 {
1360 uDst |= fMask;
1361 *puDst = uDst;
1362 *pfEFlags &= ~X86_EFL_CF;
1363 }
1364}
1365
1366
1367IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1368{
1369 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1370 logical operation (AND/OR/whatever). */
1371 Assert(uSrc < 16);
1372 uint16_t fMask = RT_BIT_32(uSrc);
1373 uint32_t uDst = *puDst;
1374 if (uDst & fMask)
1375 *pfEFlags |= X86_EFL_CF;
1376 else
1377 {
1378 uDst |= fMask;
1379 *puDst = uDst;
1380 *pfEFlags &= ~X86_EFL_CF;
1381 }
1382}
1383
1384# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1385
1386
1387EMIT_LOCKED_BIN_OP(btc, 64)
1388EMIT_LOCKED_BIN_OP(btr, 64)
1389EMIT_LOCKED_BIN_OP(bts, 64)
1390# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1391EMIT_LOCKED_BIN_OP(btc, 32)
1392EMIT_LOCKED_BIN_OP(btr, 32)
1393EMIT_LOCKED_BIN_OP(bts, 32)
1394
1395EMIT_LOCKED_BIN_OP(btc, 16)
1396EMIT_LOCKED_BIN_OP(btr, 16)
1397EMIT_LOCKED_BIN_OP(bts, 16)
1398# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1399
1400
1401/*
1402 * Helpers for BSR and BSF.
1403 *
1404 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1405 * Intel behavior modelled on 10980xe, AMD on 3990X. Other marchs may
1406 * produce different result (see https://www.sandpile.org/x86/flags.htm),
1407 * but we restrict ourselves to emulating these recent marchs.
1408 */
/**
 * Stores a BSF/BSR result the Intel way.
 *
 * All six status flags are cleared first.  When a bit was found the zero-based
 * index is stored and PF is taken from g_afParity for that index; otherwise
 * the destination is left untouched and ZF + PF are set.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_pfEFlags  Pointer to the flags to update.
 * @param   a_iBit      One-based index of the bit found, 0 if none.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR result the AMD way.
 *
 * Only ZF is modified: cleared when a bit was found (and the zero-based index
 * is stored), set when none was found (destination left untouched).
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_pfEFlags  Pointer to the flags to update.
 * @param   a_iBit      One-based index of the bit found, 0 if none.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1431
1432
1433/*
1434 * BSF - first (least significant) bit set
1435 */
1436IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1437{
1438 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1439}
1440
1441IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1442{
1443 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1444}
1445
1446IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1447{
1448 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1449}
1450
1451# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1452
1453IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1454{
1455 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1456}
1457
1458IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1459{
1460 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1461}
1462
1463IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1464{
1465 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1466}
1467
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1472}
1473
1474IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1475{
1476 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1477}
1478
1479IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1480{
1481 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1482}
1483
1484# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1485
1486
1487/*
1488 * BSR - last (most significant) bit set
1489 */
1490IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1491{
1492 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1493}
1494
1495IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1496{
1497 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1498}
1499
1500IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1501{
1502 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1503}
1504
1505# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1506
1507IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1510}
1511
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516
1517IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1518{
1519 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1520}
1521
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1526}
1527
1528IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1529{
1530 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1531}
1532
1533IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1534{
1535 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1536}
1537
1538# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1539
1540
1541/*
1542 * Helpers for LZCNT and TZCNT.
1543 */
/**
 * Stores an LZCNT/TZCNT result the Intel way.
 *
 * The count is always stored.  All six status flags are cleared first; then
 * PF comes from g_afParity for a non-zero count, ZF + PF are set for a zero
 * count, and CF is set when the source operand is zero.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_uSrc      The source operand (any expression).
 * @param   a_pfEFlags  Pointer to the flags to update.
 * @param   a_uResult   The bit count to store.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT result the AMD way.
 *
 * The count is always stored.  Only ZF and CF are modified: ZF is set for a
 * zero count and CF is set when the source operand is zero.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_uSrc      The source operand (any expression).
 * @param   a_pfEFlags  Pointer to the flags to update.
 * @param   a_uResult   The bit count to store.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1566
1567
1568/*
1569 * LZCNT - count leading zero bits.
1570 */
1571IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1572{
1573 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1574}
1575
1576IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1577{
1578 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1579}
1580
1581IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1582{
1583 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1584}
1585
1586# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1587
1588IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1589{
1590 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1591}
1592
1593IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1594{
1595 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1596}
1597
1598IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1599{
1600 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1601}
1602
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1605{
1606 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1607}
1608
1609IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1610{
1611 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1612}
1613
1614IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1615{
1616 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1617}
1618
1619# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1620
1621
/*
 * TZCNT - count trailing zero bits.
 */
1625IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1626{
1627 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1628}
1629
1630IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1631{
1632 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1633}
1634
1635IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1636{
1637 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1638}
1639
1640# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1643{
1644 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1650}
1651
1652IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1653{
1654 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1655}
1656
1657
1658IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1659{
1660 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1661}
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1666}
1667
1668IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1669{
1670 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1671}
1672
1673# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1674#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1675
1676/*
1677 * BEXTR (BMI1 instruction)
1678 */
1679#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1680IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1681 a_Type uSrc2, uint32_t *pfEFlags)) \
1682{ \
1683 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1684 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1685 a_Type uResult; \
1686 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1687 if (iFirstBit < a_cBits) \
1688 { \
1689 uResult = uSrc1 >> iFirstBit; \
1690 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1691 if (cBits < a_cBits) \
1692 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1693 *puDst = uResult; \
1694 if (!uResult) \
1695 fEfl |= X86_EFL_ZF; \
1696 } \
1697 else \
1698 { \
1699 *puDst = uResult = 0; \
1700 fEfl |= X86_EFL_ZF; \
1701 } \
1702 /** @todo complete flag calculations. */ \
1703 *pfEFlags = fEfl; \
1704}
1705
1706EMIT_BEXTR(64, uint64_t, _fallback)
1707EMIT_BEXTR(32, uint32_t, _fallback)
1708#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1709EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1710#endif
1711#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1712EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1713#endif
1714
1715/*
1716 * BLSR (BMI1 instruction)
1717 */
1718#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1719IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1720{ \
1721 uint32_t fEfl1 = *pfEFlags; \
1722 uint32_t fEfl2 = fEfl1; \
1723 *puDst = uSrc; \
1724 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1725 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1726 \
1727 /* AMD: The carry flag is from the SUB operation. */ \
1728 /* 10890xe: PF always cleared? */ \
1729 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1730 fEfl2 |= fEfl1 & X86_EFL_CF; \
1731 *pfEFlags = fEfl2; \
1732}
1733
1734EMIT_BLSR(64, uint64_t, _fallback)
1735EMIT_BLSR(32, uint32_t, _fallback)
1736#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1737EMIT_BLSR(64, uint64_t, RT_NOTHING)
1738#endif
1739#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1740EMIT_BLSR(32, uint32_t, RT_NOTHING)
1741#endif
1742
1743/*
1744 * BLSMSK (BMI1 instruction)
1745 */
1746#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1747IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1748{ \
1749 uint32_t fEfl1 = *pfEFlags; \
1750 uint32_t fEfl2 = fEfl1; \
1751 *puDst = uSrc; \
1752 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1753 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1754 \
1755 /* AMD: The carry flag is from the SUB operation. */ \
1756 /* 10890xe: PF always cleared? */ \
1757 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1758 fEfl2 |= fEfl1 & X86_EFL_CF; \
1759 *pfEFlags = fEfl2; \
1760}
1761
1762EMIT_BLSMSK(64, uint64_t, _fallback)
1763EMIT_BLSMSK(32, uint32_t, _fallback)
1764#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1765EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1766#endif
1767#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1768EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1769#endif
1770
1771/*
1772 * BLSI (BMI1 instruction)
1773 */
1774#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1775IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1776{ \
1777 uint32_t fEfl1 = *pfEFlags; \
1778 uint32_t fEfl2 = fEfl1; \
1779 *puDst = uSrc; \
1780 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1781 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1782 \
1783 /* AMD: The carry flag is from the SUB operation. */ \
1784 /* 10890xe: PF always cleared? */ \
1785 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1786 fEfl2 |= fEfl1 & X86_EFL_CF; \
1787 *pfEFlags = fEfl2; \
1788}
1789
1790EMIT_BLSI(64, uint64_t, _fallback)
1791EMIT_BLSI(32, uint32_t, _fallback)
1792#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1793EMIT_BLSI(64, uint64_t, RT_NOTHING)
1794#endif
1795#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1796EMIT_BLSI(32, uint32_t, RT_NOTHING)
1797#endif
1798
1799/*
1800 * BZHI (BMI2 instruction)
1801 */
1802#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1803IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1804 a_Type uSrc2, uint32_t *pfEFlags)) \
1805{ \
1806 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1807 a_Type uResult; \
1808 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1809 if (iFirstBit < a_cBits) \
1810 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1811 else \
1812 { \
1813 uResult = uSrc1; \
1814 fEfl |= X86_EFL_CF; \
1815 } \
1816 *puDst = uResult; \
1817 fEfl |= X86_EFL_CALC_ZF(uResult); \
1818 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1819 *pfEFlags = fEfl; \
1820}
1821
1822EMIT_BZHI(64, uint64_t, _fallback)
1823EMIT_BZHI(32, uint32_t, _fallback)
1824#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1825EMIT_BZHI(64, uint64_t, RT_NOTHING)
1826#endif
1827#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1828EMIT_BZHI(32, uint32_t, RT_NOTHING)
1829#endif
1830
1831/*
1832 * POPCNT
1833 */
1834RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1835{
1836 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1837 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1838 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1839 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1840};
1841
1842/** @todo Use native popcount where possible and employ some more efficient
1843 * algorithm here (or in asm.h fallback)! */
1844
1845DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1846{
1847 return g_abBitCounts6[ u16 & 0x3f]
1848 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1849 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1850}
1851
1852DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1853{
1854 return g_abBitCounts6[ u32 & 0x3f]
1855 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1856 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1857 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1858 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1859 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1860}
1861
1862DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1863{
1864 return g_abBitCounts6[ u64 & 0x3f]
1865 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1866 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1867 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1868 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1869 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1870 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1871 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1872 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1873 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1874 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1875}
1876
1877#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1878IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1879{ \
1880 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1881 a_Type uResult; \
1882 if (uSrc) \
1883 uResult = iemPopCountU ## a_cBits(uSrc); \
1884 else \
1885 { \
1886 fEfl |= X86_EFL_ZF; \
1887 uResult = 0; \
1888 } \
1889 *puDst = uResult; \
1890 *pfEFlags = fEfl; \
1891}
1892
1893EMIT_POPCNT(64, uint64_t, _fallback)
1894EMIT_POPCNT(32, uint32_t, _fallback)
1895EMIT_POPCNT(16, uint16_t, _fallback)
1896#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1897EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1898#endif
1899#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1900EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1901EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1902#endif
1903
1904
1905#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1906
1907/*
1908 * XCHG
1909 */
1910
1911IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1912{
1913#if ARCH_BITS >= 64
1914 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1915#else
1916 uint64_t uOldMem = *puMem;
1917 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1918 ASMNopPause();
1919 *puReg = uOldMem;
1920#endif
1921}
1922
1923# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1924
1925IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1926{
1927 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1928}
1929
1930
1931IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1932{
1933 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1934}
1935
1936
1937IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1938{
1939 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1940}
1941
1942# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1943
1944
1945/* Unlocked variants for fDisregardLock mode: */
1946
1947IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1948{
1949 uint64_t const uOld = *puMem;
1950 *puMem = *puReg;
1951 *puReg = uOld;
1952}
1953
1954# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1955
1956IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1957{
1958 uint32_t const uOld = *puMem;
1959 *puMem = *puReg;
1960 *puReg = uOld;
1961}
1962
1963
1964IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1965{
1966 uint16_t const uOld = *puMem;
1967 *puMem = *puReg;
1968 *puReg = uOld;
1969}
1970
1971
1972IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1973{
1974 uint8_t const uOld = *puMem;
1975 *puMem = *puReg;
1976 *puReg = uOld;
1977}
1978
1979# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1980
1981
1982/*
1983 * XADD and LOCK XADD.
1984 */
1985#define EMIT_XADD(a_cBitsWidth, a_Type) \
1986IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1987{ \
1988 a_Type uDst = *puDst; \
1989 a_Type uResult = uDst; \
1990 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1991 *puDst = uResult; \
1992 *puReg = uDst; \
1993} \
1994\
1995IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1996{ \
1997 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1998 a_Type uResult; \
1999 uint32_t fEflTmp; \
2000 do \
2001 { \
2002 uResult = uOld; \
2003 fEflTmp = *pfEFlags; \
2004 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2005 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2006 *puReg = uOld; \
2007 *pfEFlags = fEflTmp; \
2008}
2009EMIT_XADD(64, uint64_t)
2010# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2011EMIT_XADD(32, uint32_t)
2012EMIT_XADD(16, uint16_t)
2013EMIT_XADD(8, uint8_t)
2014# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2015
2016#endif
2017
2018/*
2019 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2020 *
2021 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2022 * instructions are emulated as locked.
2023 */
2024#if defined(IEM_WITHOUT_ASSEMBLY)
2025
2026IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2027{
2028 uint8_t uOld = *puAl;
2029 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2030 Assert(*puAl == uOld);
2031 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2032}
2033
2034
2035IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2036{
2037 uint16_t uOld = *puAx;
2038 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2039 Assert(*puAx == uOld);
2040 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2041}
2042
2043
2044IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2045{
2046 uint32_t uOld = *puEax;
2047 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2048 Assert(*puEax == uOld);
2049 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2050}
2051
2052
2053# if ARCH_BITS == 32
2054IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2055# else
2056IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2057# endif
2058{
2059# if ARCH_BITS == 32
2060 uint64_t const uSrcReg = *puSrcReg;
2061# endif
2062 uint64_t uOld = *puRax;
2063 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2064 Assert(*puRax == uOld);
2065 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2066}
2067
2068
2069IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2070 uint32_t *pEFlags))
2071{
2072 uint64_t const uNew = pu64EbxEcx->u;
2073 uint64_t const uOld = pu64EaxEdx->u;
2074 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2075 {
2076 Assert(pu64EaxEdx->u == uOld);
2077 *pEFlags |= X86_EFL_ZF;
2078 }
2079 else
2080 *pEFlags &= ~X86_EFL_ZF;
2081}
2082
2083
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * Locked CMPXCHG16B (only built for AMD64 and ARM64 hosts).
 *
 * Atomically stores RBX:RCX (*pu128RbxRcx) in *pu128Dst if it equals RDX:RAX
 * (*pu128RaxRdx).  ZF is set when the exchange happened and cleared otherwise;
 * no other flag is touched.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    RTUINT128U const uExpected = *pu128RaxRdx; /* only needed by the assertion below */
#  endif
#  if defined(RT_ARCH_AMD64)
    bool const fSwapped = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo,
                                                 pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo, &pu128RaxRdx->u);
#  else
    bool const fSwapped = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  endif
    if (!fSwapped)
        *pEFlags &= ~X86_EFL_ZF;
    else
    {
        Assert(pu128RaxRdx->s.Lo == uExpected.s.Lo && pu128RaxRdx->s.Hi == uExpected.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
}
# endif
2105
2106#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2107
2108# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2109IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2110 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2111{
2112 RTUINT128U u128Tmp = *pu128Dst;
2113 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2114 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2115 {
2116 *pu128Dst = *pu128RbxRcx;
2117 *pEFlags |= X86_EFL_ZF;
2118 }
2119 else
2120 {
2121 *pu128RaxRdx = u128Tmp;
2122 *pEFlags &= ~X86_EFL_ZF;
2123 }
2124}
2125#endif /* !RT_ARCH_ARM64 */
2126
2127#if defined(IEM_WITHOUT_ASSEMBLY)
2128
2129/* Unlocked versions mapped to the locked ones: */
2130
2131IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2132{
2133 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2134}
2135
2136
2137IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2138{
2139# if 0
2140 /* If correctly aligned, used the locked variation. */
2141 if (!((uintptr_t)pu16Dst & 1))
2142 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2143 else
2144# endif
2145 {
2146 /* Otherwise emulate it as best as we can. */
2147 uint16_t const uOld = *puAx;
2148 uint16_t const uDst = *pu16Dst;
2149 if (uOld == uDst)
2150 {
2151 *pu16Dst = uSrcReg;
2152 iemAImpl_cmp_u16(&uOld, uOld, pEFlags);
2153 }
2154 else
2155 {
2156 *puAx = uDst;
2157 iemAImpl_cmp_u16(&uOld, uDst, pEFlags);
2158 }
2159 }
2160}
2161
2162
2163IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2164{
2165# if 0
2166 /* If correctly aligned, used the locked variation. */
2167 if (!((uintptr_t)pu32Dst & 3))
2168 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2169 else
2170# endif
2171 {
2172 /* Otherwise emulate it as best as we can. */
2173 uint32_t const uOld = *puEax;
2174 uint32_t const uDst = *pu32Dst;
2175 if (uOld == uDst)
2176 {
2177 *pu32Dst = uSrcReg;
2178 iemAImpl_cmp_u32(&uOld, uOld, pEFlags);
2179 }
2180 else
2181 {
2182 *puEax = uDst;
2183 iemAImpl_cmp_u32(&uOld, uDst, pEFlags);
2184 }
2185 }
2186}
2187
2188
2189# if ARCH_BITS == 32
2190IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2191{
2192# if 0
2193 /* If correctly aligned, used the locked variation. */
2194 if (!((uintptr_t)pu32Dst & 7))
2195 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2196 else
2197# endif
2198 {
2199 /* Otherwise emulate it as best as we can. */
2200 uint64_t const uOld = *puRax;
2201 uint64_t const uSrc = *puSrcReg;
2202 uint64_t const uDst = *pu64Dst;
2203 if (uOld == uDst)
2204 {
2205 *pu64Dst = uSrc;
2206 iemAImpl_cmp_u64(&uOld, uOld, pEFlags);
2207 }
2208 else
2209 {
2210 *puRax = uDst;
2211 iemAImpl_cmp_u64(&uOld, uDst, pEFlags);
2212 }
2213 }
2214}
2215# else /* ARCH_BITS != 32 */
2216IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2217{
2218# if 0
2219 /* If correctly aligned, used the locked variation. */
2220 if (!((uintptr_t)pu64Dst & 7))
2221 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2222 else
2223# endif
2224 {
2225 /* Otherwise emulate it as best as we can. */
2226 uint64_t const uOld = *puRax;
2227 uint64_t const uDst = *pu64Dst;
2228 if (uOld == uDst)
2229 {
2230 *pu64Dst = uSrcReg;
2231 iemAImpl_cmp_u64(&uOld, uOld, pEFlags);
2232 }
2233 else
2234 {
2235 *puRax = uDst;
2236 iemAImpl_cmp_u64(&uOld, uDst, pEFlags);
2237 }
2238 }
2239}
2240# endif /* ARCH_BITS != 32 */
2241
2242
2243IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2244{
2245# if 0
2246 /* If correctly aligned, used the locked variation. */
2247 if (!((uintptr_t)pu64Dst & 7))
2248 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2249 else
2250# endif
2251 {
2252 /* Otherwise emulate it as best as we can. */
2253 uint64_t const uNew = pu64EbxEcx->u;
2254 uint64_t const uOld = pu64EaxEdx->u;
2255 uint64_t const uDst = *pu64Dst;
2256 if (uDst == uOld)
2257 {
2258 *pu64Dst = uNew;
2259 *pEFlags |= X86_EFL_ZF;
2260 }
2261 else
2262 {
2263 pu64EaxEdx->u = uDst;
2264 *pEFlags &= ~X86_EFL_ZF;
2265 }
2266 }
2267}
2268
2269
2270IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2271 uint32_t *pEFlags))
2272{
2273# if 0
2274 /* If correctly aligned, used the locked variation. */
2275 if (!((uintptr_t)pu64Dst & 15))
2276 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2277 else
2278# endif
2279 {
2280 /* Otherwise emulate it as best as we can. */
2281# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2282 uint128_t const uNew = pu128RbxRcx->u;
2283 uint128_t const uOld = pu128RaxRdx->u;
2284 uint128_t const uDst = pu128Dst->u;
2285 if (uDst == uOld)
2286 {
2287 pu128Dst->u = uNew;
2288 *pEFlags |= X86_EFL_ZF;
2289 }
2290 else
2291 {
2292 pu128RaxRdx->u = uDst;
2293 *pEFlags &= ~X86_EFL_ZF;
2294 }
2295# else
2296 RTUINT128U const uNew = *pu128RbxRcx;
2297 RTUINT128U const uOld = *pu128RaxRdx;
2298 RTUINT128U const uDst = *pu128Dst;
2299 if ( uDst.s.Lo == uOld.s.Lo
2300 && uDst.s.Hi == uOld.s.Hi)
2301 {
2302 *pu128Dst = uNew;
2303 *pEFlags |= X86_EFL_ZF;
2304 }
2305 else
2306 {
2307 *pu128RaxRdx = uDst;
2308 *pEFlags &= ~X86_EFL_ZF;
2309 }
2310# endif
2311 }
2312}
2313
2314#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2315
2316#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2317 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2318
2319/*
2320 * MUL, IMUL, DIV and IDIV helpers.
2321 *
2322 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2323 * division step so we can select between using C operators and
2324 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2325 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
 *   IDIV/DIV also taking all of their input from AX. This means we have to
 *   abstract some of the input loads and the result storing.
2329 */
2330
2331DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2332{
2333# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2334 pQuotient->s.Lo = 0;
2335 pQuotient->s.Hi = 0;
2336# endif
2337 RTUINT128U Divisor;
2338 Divisor.s.Lo = u64Divisor;
2339 Divisor.s.Hi = 0;
2340 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2341}
2342
/*
 * Operand load/store and arithmetic helpers plugged into the EMIT_MUL, EMIT_IMUL,
 * EMIT_DIV and EMIT_IDIV templates.  They reference the puA/puD (or puAX)
 * parameters of the function they are expanded in.  All struct arguments are
 * parenthesized and none of the helpers ends in a semicolon, so each one
 * expands to a single expression that the caller terminates.
 */

/* Load the double-width dividend from xAX (low) and xDX (high), or from AX for the 8-bit forms. */
# define DIV_LOAD(a_Dividend) \
    (a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD
# define DIV_LOAD_U8(a_Dividend) \
    (a_Dividend).u = *puAX

/* Store quotient/remainder in xAX/xDX, or in AL/AH for the 8-bit forms (each truncated to 8 bits). */
# define DIV_STORE(a_Quotient, a_uReminder)    *puA = (a_Quotient), *puD = (a_uReminder)
# define DIV_STORE_U8(a_Quotient, a_uReminder) \
    *puAX = (uint16_t)((uint8_t)(a_Quotient) | ((uint16_t)(uint8_t)(a_uReminder) << 8))

/* Load the implicit first factor from xAX, or from AL for the 8-bit forms. */
# define MUL_LOAD_F1()                  *puA
# define MUL_LOAD_F1_U8()               ((uint8_t)*puAX)

/* Store the double-width product in xDX:xAX, or in AX for the 8-bit forms. */
# define MUL_STORE(a_Result)            *puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi
# define MUL_STORE_U8(a_Result)         *puAX = (a_Result).u

/* Two's complement negation of a double-width value. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Unsigned multiplication producing a double-width result. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2)

/* Unsigned division of a double-width dividend, yielding quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    (a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
    (a_Remainder).u = (a_Dividend).u % (a_uDivisor)
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), a_uDivisor)
2372
2373
2374/*
2375 * MUL
2376 */
2377# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2378IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2379{ \
2380 RTUINT ## a_cBitsWidth2x ## U Result; \
2381 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2382 a_fnStore(Result); \
2383 \
2384 /* Calc EFLAGS: */ \
2385 uint32_t fEfl = *pfEFlags; \
2386 if (a_fIntelFlags) \
2387 { /* Intel: 6700K and 10980XE behavior */ \
2388 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2389 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2390 fEfl |= X86_EFL_SF; \
2391 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2392 if (Result.s.Hi != 0) \
2393 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2394 } \
2395 else \
2396 { /* AMD: 3990X */ \
2397 if (Result.s.Hi != 0) \
2398 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2399 else \
2400 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2401 } \
2402 *pfEFlags = fEfl; \
2403 return 0; \
2404} \
2405
2406# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2407 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2408 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2409 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2410
2411# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2412EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2413 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2414# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2415EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2416 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2417EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2418 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2419EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2420 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2421# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2422# endif /* !DOXYGEN_RUNNING */
2423
2424/*
2425 * MULX
2426 */
2427# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2428IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2429 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2430{ \
2431 RTUINT ## a_cBitsWidth2x ## U Result; \
2432 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2433 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2434 *puDst1 = Result.s.Hi; \
2435} \
2436
2437# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2438EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2439EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2440# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2441EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2442EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2443# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2444# endif /* !DOXYGEN_RUNNING */
2445
2446
2447/*
2448 * IMUL
2449 *
 * The SF, ZF, AF and PF flags are "undefined". AMD (3990X) leaves these
 * flags as is, whereas Intel Skylake (6700K) and Cascade Lake (10980XE) always
 * clear AF and ZF and calculate SF and PF as per the lower half of the result.
2453 */
2454# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2455 a_Suffix, a_fIntelFlags) \
2456IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2457{ \
2458 RTUINT ## a_cBitsWidth2x ## U Result; \
2459 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2460 \
2461 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2462 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2463 { \
2464 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2465 { \
2466 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2467 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2468 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2469 } \
2470 else \
2471 { \
2472 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2473 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2474 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2475 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2476 a_fnNeg(Result, a_cBitsWidth2x); \
2477 } \
2478 } \
2479 else \
2480 { \
2481 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2482 { \
2483 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2484 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2485 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2486 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2487 a_fnNeg(Result, a_cBitsWidth2x); \
2488 } \
2489 else \
2490 { \
2491 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2492 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2493 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2494 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2495 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2496 } \
2497 } \
2498 a_fnStore(Result); \
2499 \
2500 if (a_fIntelFlags) \
2501 { \
2502 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2503 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2504 fEfl |= X86_EFL_SF; \
2505 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2506 } \
2507 *pfEFlags = fEfl; \
2508 return 0; \
2509}
2510# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2511 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2512 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2513 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2514
2515# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2516EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2517 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2518# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2519EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2520 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2521EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2522 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2523EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2524 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2525# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2526# endif /* !DOXYGEN_RUNNING */
2527
2528
2529/*
2530 * IMUL with two operands are mapped onto the three operand variant, ignoring
2531 * the high part of the product.
2532 */
2533# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2534IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2535{ \
2536 a_uType uIgn; \
2537 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2538} \
2539\
2540IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2541{ \
2542 a_uType uIgn; \
2543 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2544} \
2545\
2546IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2547{ \
2548 a_uType uIgn; \
2549 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2550}
2551
2552EMIT_IMUL_TWO(64, uint64_t)
2553# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2554EMIT_IMUL_TWO(32, uint32_t)
2555EMIT_IMUL_TWO(16, uint16_t)
2556# endif
2557
2558
2559/*
2560 * DIV
2561 */
2562# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2563 a_Suffix, a_fIntelFlags) \
2564IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2565{ \
2566 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2567 a_fnLoad(Dividend); \
2568 if ( uDivisor != 0 \
2569 && Dividend.s.Hi < uDivisor) \
2570 { \
2571 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2572 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2573 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2574 \
2575 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2576 if (!a_fIntelFlags) \
2577 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2578 return 0; \
2579 } \
2580 /* #DE */ \
2581 return -1; \
2582}
2583# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2584 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2585 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2586 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2587
2588# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2589EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2590 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2591# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2592EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2593 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2594EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2595 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2596EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2597 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2598# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2599# endif /* !DOXYGEN_RUNNING */
2600
2601
2602/*
2603 * IDIV
2604 *
2605 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2606 * set AF and clear PF, ZF and SF just like it does for DIV.
2607 *
2608 */
/*
 * Template for one signed IDIV worker.
 *
 * Dividend and divisor are reduced to their magnitudes and divided unsigned.
 * The quotient is negative when exactly one operand was negative; the
 * remainder takes the sign of the dividend.  Returns -1 (the caller raises
 * #DE) for a zero divisor or when the quotient magnitude does not fit: it may
 * reach 2^(width-1) for a negative quotient but only the signed maximum for a
 * positive one.  Returns 0 on success.
 */
# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    /* Note! Skylake leaves all flags alone. */ \
    \
    /** @todo overflow checks */ \
    if (uDivisor == 0) \
        return -1; /* #DE */ \
    \
    /* \
     * Convert to unsigned division. \
     */ \
    RTUINT ## a_cBitsWidth2x ## U Dividend; \
    a_fnLoad(Dividend); \
    bool const fNegDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
    if (fNegDividend) \
    { \
        a_fnNeg(Dividend, a_cBitsWidth2x); \
    } \
    \
    bool const fNegDivisor = RT_BOOL(uDivisor & RT_BIT_64(a_cBitsWidth - 1)); \
    uint ## a_cBitsWidth ## _t const uDivisorPositive = fNegDivisor \
                                                      ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - uDivisor) \
                                                      : uDivisor; \
    \
    RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
    a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
    \
    /* \
     * Check the quotient magnitude against the limit for its sign. \
     */ \
    bool const     fNegQuotient = fNegDividend != fNegDivisor; \
    uint64_t const uMaxQuotient = fNegQuotient \
                                ? RT_BIT_64(a_cBitsWidth - 1) \
                                : (uint64_t)(uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX; \
    if (   Quotient.s.Hi != 0 \
        || Quotient.s.Lo > uMaxQuotient) \
        return -1; /* #DE */ \
    \
    /* \
     * Store the results with their signs restored. \
     */ \
    a_fnStore(fNegQuotient ? UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo  : Quotient.s.Lo, \
              fNegDividend ? UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo : Remainder.s.Lo); \
    if (!a_fIntelFlags) \
        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
    return 0; \
}

/* Emits the default, _intel and _amd flavours; the default uses the Intel flag behaviour. */
# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel,     1) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd,       0)
2697
2698# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2699EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2700 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2701# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2702EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2703 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2704EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2705 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2706EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2707 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2708# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2709# endif /* !DOXYGEN_RUNNING */
2710
2711#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2712
2713
2714/*********************************************************************************************************************************
2715* Unary operations. *
2716*********************************************************************************************************************************/
2717#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2718
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Recalculates PF, AF, ZF, SF and OF of an EFLAGS value after an INC or DEC.
 *
 * CF is NOT modified, INC and DEC leave it alone (allegedly so it can be used
 * for carrying and borrowing in arithmetic loops, going back to the intel 8008).
 *
 * This is a statement macro; it stores the new flags through @a a_pfEFlags and
 * does not yield a value.  The arguments are evaluated more than once.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for the AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_OfMethod      0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        /* Start from the old flags with every status bit but CF cleared. */ \
        uint32_t fEflTmp = *(a_pfEFlags) & (~X86_EFL_STATUS_BITS | X86_EFL_CF); \
        /* OF - INC: top bit clear before and set after;  DEC: top bit set before and clear after. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
                                                   : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        /* AF - the AF bit position differs between the old and the new value. */ \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        /* PF - looked up from the low byte of the result. */ \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2744
2745/*
2746 * INC
2747 */
2748
2749IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2750{
2751 uint64_t uDst = *puDst;
2752 uint64_t uResult = uDst + 1;
2753 *puDst = uResult;
2754 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2755}
2756
2757# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2758
2759IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2760{
2761 uint32_t uDst = *puDst;
2762 uint32_t uResult = uDst + 1;
2763 *puDst = uResult;
2764 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2765}
2766
2767
2768IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2769{
2770 uint16_t uDst = *puDst;
2771 uint16_t uResult = uDst + 1;
2772 *puDst = uResult;
2773 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2774}
2775
2776IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2777{
2778 uint8_t uDst = *puDst;
2779 uint8_t uResult = uDst + 1;
2780 *puDst = uResult;
2781 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2782}
2783
2784# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2785
2786
2787/*
2788 * DEC
2789 */
2790
2791IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2792{
2793 uint64_t uDst = *puDst;
2794 uint64_t uResult = uDst - 1;
2795 *puDst = uResult;
2796 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2797}
2798
2799# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2800
2801IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2802{
2803 uint32_t uDst = *puDst;
2804 uint32_t uResult = uDst - 1;
2805 *puDst = uResult;
2806 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2807}
2808
2809
2810IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2811{
2812 uint16_t uDst = *puDst;
2813 uint16_t uResult = uDst - 1;
2814 *puDst = uResult;
2815 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2816}
2817
2818
2819IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2820{
2821 uint8_t uDst = *puDst;
2822 uint8_t uResult = uDst - 1;
2823 *puDst = uResult;
2824 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2825}
2826
2827# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2828
2829
2830/*
2831 * NOT
2832 */
2833
2834IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2835{
2836 uint64_t uDst = *puDst;
2837 uint64_t uResult = ~uDst;
2838 *puDst = uResult;
2839 /* EFLAGS are not modified. */
2840 RT_NOREF_PV(pfEFlags);
2841}
2842
2843# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2844
2845IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2846{
2847 uint32_t uDst = *puDst;
2848 uint32_t uResult = ~uDst;
2849 *puDst = uResult;
2850 /* EFLAGS are not modified. */
2851 RT_NOREF_PV(pfEFlags);
2852}
2853
2854IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2855{
2856 uint16_t uDst = *puDst;
2857 uint16_t uResult = ~uDst;
2858 *puDst = uResult;
2859 /* EFLAGS are not modified. */
2860 RT_NOREF_PV(pfEFlags);
2861}
2862
2863IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2864{
2865 uint8_t uDst = *puDst;
2866 uint8_t uResult = ~uDst;
2867 *puDst = uResult;
2868 /* EFLAGS are not modified. */
2869 RT_NOREF_PV(pfEFlags);
2870}
2871
2872# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2873
2874
2875/*
2876 * NEG
2877 */
2878
/**
 * Recalculates all the status bits (CF, PF, AF, ZF, SF, and OF) of an EFLAGS
 * value after a NEG instruction.
 *
 * This is a statement macro; it stores the new flags through @a a_pfEFlags and
 * does not yield a value.  The arguments are evaluated more than once.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for the CF, AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags) & (~X86_EFL_STATUS_BITS & ~X86_EFL_CF); \
        /* CF - set for every non-zero input. */ \
        fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
        /* OF - the top bit is set both before and after the negation. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        /* AF - the AF bit position differs between the old and the new value. */ \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        /* PF - looked up from the low byte of the result. */ \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2900
2901IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2902{
2903 uint64_t uDst = *puDst;
2904 uint64_t uResult = (uint64_t)0 - uDst;
2905 *puDst = uResult;
2906 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2907}
2908
2909# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2910
2911IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2912{
2913 uint32_t uDst = *puDst;
2914 uint32_t uResult = (uint32_t)0 - uDst;
2915 *puDst = uResult;
2916 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2917}
2918
2919
2920IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2921{
2922 uint16_t uDst = *puDst;
2923 uint16_t uResult = (uint16_t)0 - uDst;
2924 *puDst = uResult;
2925 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2926}
2927
2928
2929IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2930{
2931 uint8_t uDst = *puDst;
2932 uint8_t uResult = (uint8_t)0 - uDst;
2933 *puDst = uResult;
2934 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2935}
2936
2937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2938
2939/*
2940 * Locked variants.
2941 */
2942
2943/** Emit a function for doing a locked unary operand operation. */
2944# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2945 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2946 uint32_t *pfEFlags)) \
2947 { \
2948 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2949 uint ## a_cBitsWidth ## _t uTmp; \
2950 uint32_t fEflTmp; \
2951 do \
2952 { \
2953 uTmp = uOld; \
2954 fEflTmp = *pfEFlags; \
2955 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2956 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2957 *pfEFlags = fEflTmp; \
2958 }
2959
2960EMIT_LOCKED_UNARY_OP(inc, 64)
2961EMIT_LOCKED_UNARY_OP(dec, 64)
2962EMIT_LOCKED_UNARY_OP(not, 64)
2963EMIT_LOCKED_UNARY_OP(neg, 64)
2964# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2965EMIT_LOCKED_UNARY_OP(inc, 32)
2966EMIT_LOCKED_UNARY_OP(dec, 32)
2967EMIT_LOCKED_UNARY_OP(not, 32)
2968EMIT_LOCKED_UNARY_OP(neg, 32)
2969
2970EMIT_LOCKED_UNARY_OP(inc, 16)
2971EMIT_LOCKED_UNARY_OP(dec, 16)
2972EMIT_LOCKED_UNARY_OP(not, 16)
2973EMIT_LOCKED_UNARY_OP(neg, 16)
2974
2975EMIT_LOCKED_UNARY_OP(inc, 8)
2976EMIT_LOCKED_UNARY_OP(dec, 8)
2977EMIT_LOCKED_UNARY_OP(not, 8)
2978EMIT_LOCKED_UNARY_OP(neg, 8)
2979# endif
2980
2981#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2982
2983
2984/*********************************************************************************************************************************
2985* Shifting and Rotating *
2986*********************************************************************************************************************************/
2987
2988/*
2989 * ROL
2990 */
2991#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2992IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2993{ \
2994 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2995 if (cShift) \
2996 { \
2997 if (a_cBitsWidth < 32) \
2998 cShift &= a_cBitsWidth - 1; \
2999 a_uType const uDst = *puDst; \
3000 a_uType const uResult = a_fnHlp(uDst, cShift); \
3001 *puDst = uResult; \
3002 \
3003 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3004 it the same way as for 1 bit shifts. */ \
3005 AssertCompile(X86_EFL_CF_BIT == 0); \
3006 uint32_t fEfl = *pfEFlags; \
3007 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3008 uint32_t const fCarry = (uResult & X86_EFL_CF); \
3009 fEfl |= fCarry; \
3010 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3011 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
3012 else /* Intel 10980XE: According to the first sub-shift: */ \
3013 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3014 *pfEFlags = fEfl; \
3015 } \
3016}
3017
3018#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3019EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
3020#endif
3021EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
3022EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
3023
3024#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3025EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
3026#endif
3027EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
3028EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
3029
3030DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
3031{
3032 return (uValue << cShift) | (uValue >> (16 - cShift));
3033}
3034#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3035EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
3036#endif
3037EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
3038EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
3039
3040DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
3041{
3042 return (uValue << cShift) | (uValue >> (8 - cShift));
3043}
3044#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3045EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
3046#endif
3047EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
3048EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
3049
3050
3051/*
3052 * ROR
3053 */
3054#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
3055IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3056{ \
3057 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3058 if (cShift) \
3059 { \
3060 if (a_cBitsWidth < 32) \
3061 cShift &= a_cBitsWidth - 1; \
3062 a_uType const uDst = *puDst; \
3063 a_uType const uResult = a_fnHlp(uDst, cShift); \
3064 *puDst = uResult; \
3065 \
3066 /* Calc EFLAGS: */ \
3067 AssertCompile(X86_EFL_CF_BIT == 0); \
3068 uint32_t fEfl = *pfEFlags; \
3069 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3070 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
3071 fEfl |= fCarry; \
3072 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3073 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
3074 else /* Intel 10980XE: According to the first sub-shift: */ \
3075 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
3076 *pfEFlags = fEfl; \
3077 } \
3078}
3079
3080#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3081EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
3082#endif
3083EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
3084EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
3085
3086#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3087EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
3088#endif
3089EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
3090EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
3091
3092DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
3093{
3094 return (uValue >> cShift) | (uValue << (16 - cShift));
3095}
3096#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3097EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
3098#endif
3099EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
3100EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
3101
3102DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
3103{
3104 return (uValue >> cShift) | (uValue << (8 - cShift));
3105}
3106#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3107EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
3108#endif
3109EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
3110EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
3111
3112
3113/*
3114 * RCL
3115 */
3116#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3117IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3118{ \
3119 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3120 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3121 cShift %= a_cBitsWidth + 1; \
3122 if (cShift) \
3123 { \
3124 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3125 cShift %= a_cBitsWidth + 1; \
3126 a_uType const uDst = *puDst; \
3127 a_uType uResult = uDst << cShift; \
3128 if (cShift > 1) \
3129 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
3130 \
3131 AssertCompile(X86_EFL_CF_BIT == 0); \
3132 uint32_t fEfl = *pfEFlags; \
3133 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3134 uResult |= (a_uType)fInCarry << (cShift - 1); \
3135 \
3136 *puDst = uResult; \
3137 \
3138 /* Calc EFLAGS. */ \
3139 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3140 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3141 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
3142 fEfl |= fOutCarry; \
3143 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3144 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3145 else /* Intel 10980XE: According to the first sub-shift: */ \
3146 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3147 *pfEFlags = fEfl; \
3148 } \
3149}
3150
3151#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3152EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3153#endif
3154EMIT_RCL(64, uint64_t, _intel, 1)
3155EMIT_RCL(64, uint64_t, _amd, 0)
3156
3157#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3158EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3159#endif
3160EMIT_RCL(32, uint32_t, _intel, 1)
3161EMIT_RCL(32, uint32_t, _amd, 0)
3162
3163#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3164EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3165#endif
3166EMIT_RCL(16, uint16_t, _intel, 1)
3167EMIT_RCL(16, uint16_t, _amd, 0)
3168
3169#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3170EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3171#endif
3172EMIT_RCL(8, uint8_t, _intel, 1)
3173EMIT_RCL(8, uint8_t, _amd, 0)
3174
3175
3176/*
3177 * RCR
3178 */
3179#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3180IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3181{ \
3182 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3183 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3184 cShift %= a_cBitsWidth + 1; \
3185 if (cShift) \
3186 { \
3187 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3188 cShift %= a_cBitsWidth + 1; \
3189 a_uType const uDst = *puDst; \
3190 a_uType uResult = uDst >> cShift; \
3191 if (cShift > 1) \
3192 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3193 \
3194 AssertCompile(X86_EFL_CF_BIT == 0); \
3195 uint32_t fEfl = *pfEFlags; \
3196 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3197 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3198 *puDst = uResult; \
3199 \
3200 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3201 it the same way as for 1 bit shifts. */ \
3202 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3203 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3204 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3205 fEfl |= fOutCarry; \
3206 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3207 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3208 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3209 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3210 *pfEFlags = fEfl; \
3211 } \
3212}
3213
3214#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3215EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3216#endif
3217EMIT_RCR(64, uint64_t, _intel, 1)
3218EMIT_RCR(64, uint64_t, _amd, 0)
3219
3220#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3221EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3222#endif
3223EMIT_RCR(32, uint32_t, _intel, 1)
3224EMIT_RCR(32, uint32_t, _amd, 0)
3225
3226#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3227EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3228#endif
3229EMIT_RCR(16, uint16_t, _intel, 1)
3230EMIT_RCR(16, uint16_t, _amd, 0)
3231
3232#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3233EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3234#endif
3235EMIT_RCR(8, uint8_t, _intel, 1)
3236EMIT_RCR(8, uint8_t, _amd, 0)
3237
3238
3239/*
3240 * SHL
3241 */
3242#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3243IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3244{ \
3245 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3246 if (cShift) \
3247 { \
3248 a_uType const uDst = *puDst; \
3249 a_uType uResult = uDst << cShift; \
3250 *puDst = uResult; \
3251 \
3252 /* Calc EFLAGS. */ \
3253 AssertCompile(X86_EFL_CF_BIT == 0); \
3254 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3255 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3256 fEfl |= fCarry; \
3257 if (!a_fIntelFlags) \
3258 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3259 else \
3260 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3261 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3262 fEfl |= X86_EFL_CALC_ZF(uResult); \
3263 fEfl |= g_afParity[uResult & 0xff]; \
3264 if (!a_fIntelFlags) \
3265 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3266 *pfEFlags = fEfl; \
3267 } \
3268}
3269
3270#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3271EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3272#endif
3273EMIT_SHL(64, uint64_t, _intel, 1)
3274EMIT_SHL(64, uint64_t, _amd, 0)
3275
3276#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3277EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3278#endif
3279EMIT_SHL(32, uint32_t, _intel, 1)
3280EMIT_SHL(32, uint32_t, _amd, 0)
3281
3282#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3283EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3284#endif
3285EMIT_SHL(16, uint16_t, _intel, 1)
3286EMIT_SHL(16, uint16_t, _amd, 0)
3287
3288#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3289EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3290#endif
3291EMIT_SHL(8, uint8_t, _intel, 1)
3292EMIT_SHL(8, uint8_t, _amd, 0)
3293
3294
3295/*
3296 * SHR
3297 */
3298#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3299IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3300{ \
3301 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3302 if (cShift) \
3303 { \
3304 a_uType const uDst = *puDst; \
3305 a_uType uResult = uDst >> cShift; \
3306 *puDst = uResult; \
3307 \
3308 /* Calc EFLAGS. */ \
3309 AssertCompile(X86_EFL_CF_BIT == 0); \
3310 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3311 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3312 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3313 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3314 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3315 fEfl |= X86_EFL_CALC_ZF(uResult); \
3316 fEfl |= g_afParity[uResult & 0xff]; \
3317 if (!a_fIntelFlags) \
3318 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3319 *pfEFlags = fEfl; \
3320 } \
3321}
3322
3323#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3324EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3325#endif
3326EMIT_SHR(64, uint64_t, _intel, 1)
3327EMIT_SHR(64, uint64_t, _amd, 0)
3328
3329#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3330EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3331#endif
3332EMIT_SHR(32, uint32_t, _intel, 1)
3333EMIT_SHR(32, uint32_t, _amd, 0)
3334
3335#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3336EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3337#endif
3338EMIT_SHR(16, uint16_t, _intel, 1)
3339EMIT_SHR(16, uint16_t, _amd, 0)
3340
3341#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3342EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3343#endif
3344EMIT_SHR(8, uint8_t, _intel, 1)
3345EMIT_SHR(8, uint8_t, _amd, 0)
3346
3347
3348/*
3349 * SAR
3350 */
3351#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3352IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3353{ \
3354 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3355 if (cShift) \
3356 { \
3357 a_iType const iDst = (a_iType)*puDst; \
3358 a_uType uResult = iDst >> cShift; \
3359 *puDst = uResult; \
3360 \
3361 /* Calc EFLAGS. \
3362 Note! The OF flag is always zero because the result never differs from the input. */ \
3363 AssertCompile(X86_EFL_CF_BIT == 0); \
3364 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3365 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3366 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3367 fEfl |= X86_EFL_CALC_ZF(uResult); \
3368 fEfl |= g_afParity[uResult & 0xff]; \
3369 if (!a_fIntelFlags) \
3370 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3371 *pfEFlags = fEfl; \
3372 } \
3373}
3374
3375#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3376EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3377#endif
3378EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3379EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3380
3381#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3382EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3383#endif
3384EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3385EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3386
3387#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3388EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3389#endif
3390EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3391EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3392
3393#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3394EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3395#endif
3396EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3397EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3398
3399
3400/*
3401 * SHLD
3402 *
3403 * - CF is the last bit shifted out of puDst.
3404 * - AF is always cleared by Intel 10980XE.
3405 * - AF is always set by AMD 3990X.
3406 * - OF is set according to the first shift on Intel 10980XE, it seems.
3407 * - OF is set according to the last sub-shift on AMD 3990X.
3408 * - ZF, SF and PF are calculated according to the result by both vendors.
3409 *
3410 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3411 * pick either the source register or the destination register for input bits
3412 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3413 * intel has changed behaviour here several times. We implement what current
3414 * skylake based does for now, we can extend this later as needed.
3415 */
3416#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3417IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3418 uint32_t *pfEFlags)) \
3419{ \
3420 cShift &= a_cBitsWidth - 1; \
3421 if (cShift) \
3422 { \
3423 a_uType const uDst = *puDst; \
3424 a_uType uResult = uDst << cShift; \
3425 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3426 *puDst = uResult; \
3427 \
3428 /* CALC EFLAGS: */ \
3429 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3430 if (a_fIntelFlags) \
3431 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3432 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3433 else \
3434 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3435 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3436 fEfl |= X86_EFL_AF; \
3437 } \
3438 AssertCompile(X86_EFL_CF_BIT == 0); \
3439 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3440 fEfl |= g_afParity[uResult & 0xff]; \
3441 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3442 fEfl |= X86_EFL_CALC_ZF(uResult); \
3443 *pfEFlags = fEfl; \
3444 } \
3445}
3446
3447#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3448EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3449#endif
3450EMIT_SHLD(64, uint64_t, _intel, 1)
3451EMIT_SHLD(64, uint64_t, _amd, 0)
3452
3453#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3454EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3455#endif
3456EMIT_SHLD(32, uint32_t, _intel, 1)
3457EMIT_SHLD(32, uint32_t, _amd, 0)
3458
3459#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3460IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3461{ \
3462 cShift &= 31; \
3463 if (cShift) \
3464 { \
3465 uint16_t const uDst = *puDst; \
3466 uint64_t const uTmp = a_fIntelFlags \
3467 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3468 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3469 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3470 *puDst = uResult; \
3471 \
3472 /* CALC EFLAGS: */ \
3473 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3474 AssertCompile(X86_EFL_CF_BIT == 0); \
3475 if (a_fIntelFlags) \
3476 { \
3477 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3478 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3479 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3480 } \
3481 else \
3482 { \
3483 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3484 if (cShift < 16) \
3485 { \
3486 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3487 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3488 } \
3489 else \
3490 { \
3491 if (cShift == 16) \
3492 fEfl |= uDst & X86_EFL_CF; \
3493 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3494 } \
3495 fEfl |= X86_EFL_AF; \
3496 } \
3497 fEfl |= g_afParity[uResult & 0xff]; \
3498 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3499 fEfl |= X86_EFL_CALC_ZF(uResult); \
3500 *pfEFlags = fEfl; \
3501 } \
3502}
3503
3504#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3505EMIT_SHLD_16(RT_NOTHING, 1)
3506#endif
3507EMIT_SHLD_16(_intel, 1)
3508EMIT_SHLD_16(_amd, 0)
3509
3510
3511/*
3512 * SHRD
3513 *
3514 * EFLAGS behaviour seems to be the same as with SHLD:
3515 * - CF is the last bit shifted out of puDst.
3516 * - AF is always cleared by Intel 10980XE.
3517 * - AF is always set by AMD 3990X.
3518 * - OF is set according to the first shift on Intel 10980XE, it seems.
3519 * - OF is set according to the last sub-shift on AMD 3990X.
3520 * - ZF, SF and PF are calculated according to the result by both vendors.
3521 *
3522 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3523 * pick either the source register or the destination register for input bits
3524 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
 * Intel has changed behaviour here several times.  We implement what current
 * Skylake-based CPUs do for now; this can be extended later as needed.
3527 */
/**
 * Emits a 32-bit or 64-bit SHRD worker named iemAImpl_shrd_u<width><a_Suffix>.
 *
 * The count is masked with a_cBitsWidth - 1; a masked count of zero changes
 * neither the destination nor EFLAGS.
 *
 * @param   a_cBitsWidth    Operand width in bits (32 or 64).
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero for Intel OF/AF behaviour, zero for AMD.
 */
#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth - 1; \
    if (cShift) \
    { \
        /* Shift the destination right and fill the vacated top bits from the source. */ \
        a_uType const uOld = *puDst; \
        a_uType const uNew = (a_uType)((uOld >> cShift) | (uSrc << (a_cBitsWidth - cShift))); \
        *puDst = uNew; \
        \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fNewEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        \
        /* PF, ZF and SF follow the result for both vendors. */ \
        fNewEfl |= g_afParity[uNew & 0xff]; \
        fNewEfl |= X86_EFL_CALC_ZF(uNew); \
        fNewEfl |= X86_EFL_CALC_SF(uNew, a_cBitsWidth); \
        \
        /* CF is the last bit shifted out of the destination. */ \
        fNewEfl |= (uOld >> (cShift - 1)) & X86_EFL_CF; \
        \
        if (!(a_fIntelFlags)) \
        { \
            /* AMD 3990X: AF always set, OF set according to the last shift. */ \
            fNewEfl |= X86_EFL_AF; \
            fNewEfl |= cShift > 1 \
                     ? X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uNew) \
                     : X86_EFL_GET_OF_ ## a_cBitsWidth(uOld ^ uNew); \
        } \
        else \
            /* Intel 6700K & 10980XE: AF always cleared, OF set according to the first shift. */ \
            fNewEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uOld ^ (uSrc << (a_cBitsWidth - 1))); \
        \
        *pfEFlags = fNewEfl; \
    } \
}
3559
/* 64-bit workers: the unsuffixed one is only emitted from C when the host is not
   AMD64 or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHRD(64, uint64_t, _intel, 1)
EMIT_SHRD(64, uint64_t, _amd, 0)

/* 32-bit workers: the unsuffixed one is only emitted from C when the host is
   neither x86 nor AMD64, or IEM_WITHOUT_ASSEMBLY is defined. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHRD(32, uint32_t, _intel, 1)
EMIT_SHRD(32, uint32_t, _amd, 0)
3571
/**
 * Emits a 16-bit SHRD worker named iemAImpl_shrd_u16<a_Suffix>.
 *
 * The count is masked with 31.  The shift operates on a 48-bit value made of
 * the destination (bits 15:0), the source (bits 31:16) and a top word
 * (bits 47:32) which is a copy of the destination for the Intel flavour and a
 * copy of the source for the AMD one.  A masked count of zero is a no-op.
 *
 * @param   a_Suffix        Function name suffix (RT_NOTHING, _intel or _amd).
 * @param   a_fIntelFlags   Non-zero for Intel behaviour, zero for AMD.
 */
#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= 31; \
    if (cShift) \
    { \
        uint16_t const uOld     = *puDst; \
        uint16_t const uTopWord = a_fIntelFlags ? uOld : uSrc; \
        uint64_t const uWide    = ((uint64_t)uTopWord << 32) | ((uint32_t)uSrc << 16) | uOld; \
        uint16_t const uNew     = (uint16_t)(uWide >> cShift); \
        *puDst = uNew; \
        \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fNewEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        \
        /* PF, ZF and SF follow the result for both vendors. */ \
        fNewEfl |= g_afParity[uNew & 0xff]; \
        fNewEfl |= X86_EFL_CALC_ZF(uNew); \
        fNewEfl |= X86_EFL_CALC_SF(uNew, 16); \
        \
        if (!(a_fIntelFlags)) \
        { \
            /* AMD 3990X: CF seems to be the last bit shifted out of uDst only, not of the combined \
               uSrc:uSrc:uDst operand.  AF always set.  OF set according to the last shift. */ \
            fNewEfl |= (uOld >> (cShift - 1)) & X86_EFL_CF; \
            fNewEfl |= X86_EFL_AF; \
            fNewEfl |= cShift > 1 \
                     ? X86_EFL_GET_OF_16((uint16_t)(uWide >> (cShift - 1)) ^ uNew) \
                     : X86_EFL_GET_OF_16(uOld ^ uNew); \
        } \
        else \
        { \
            /* Intel 10980XE: CF is the last bit shifted out of the combined operand. */ \
            fNewEfl |= (uWide >> (cShift - 1)) & X86_EFL_CF; \
            /* Intel 6700K & 10980XE: OF set according to the first shift. AF always cleared. */ \
            fNewEfl |= X86_EFL_GET_OF_16(uOld ^ (uSrc << 15)); \
        } \
        *pfEFlags = fNewEfl; \
    } \
}
3611
/* The unsuffixed worker is only emitted from C when the host is neither x86 nor
   AMD64, or when IEM_WITHOUT_ASSEMBLY is defined; the _intel and _amd variants
   are always emitted. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD_16(RT_NOTHING, 1)
#endif
EMIT_SHRD_16(_intel, 1)
EMIT_SHRD_16(_amd, 0)
3617
3618
3619/*
3620 * RORX (BMI2)
3621 */
/**
 * Emits iemAImpl_rorx_u<width>: rotates uSrc right and stores the result in
 * *puDst.  Only the low bits of cShift (masked with a_cBitsWidth - 1) are used.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_fnHlp         Rotate-right helper taking (value, count).
 */
#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
{ \
    a_uType const cRotate = cShift & (a_cBitsWidth - 1); \
    *puDst = a_fnHlp(uSrc, cRotate); \
}
3627
/* The C workers are only emitted when the host architecture condition below
   holds or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RORX(64, uint64_t, ASMRotateRightU64)
#endif
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RORX(32, uint32_t, ASMRotateRightU32)
#endif
3634
3635
3636/*
3637 * SHLX (BMI2)
3638 */
/**
 * Emits iemAImpl_shlx_u<width><a_Suffix>: logical left shift of uSrc by the
 * count masked with a_cBitsWidth - 1, stored in *puDst.  No flags are involved.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix.
 */
#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
{ \
    a_uType const cMasked = cShift & (a_cBitsWidth - 1); \
    *puDst = uSrc << cMasked; \
}
3645
/* Both the unsuffixed and the _fallback workers are emitted only when the host
   architecture condition holds or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLX(64, uint64_t, RT_NOTHING)
EMIT_SHLX(64, uint64_t, _fallback)
#endif
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLX(32, uint32_t, RT_NOTHING)
EMIT_SHLX(32, uint32_t, _fallback)
#endif
3654
3655
3656/*
3657 * SHRX (BMI2)
3658 */
/**
 * Emits iemAImpl_shrx_u<width><a_Suffix>: logical right shift of uSrc by the
 * count masked with a_cBitsWidth - 1, stored in *puDst.  No flags are involved.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix.
 */
#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
{ \
    a_uType const cMasked = cShift & (a_cBitsWidth - 1); \
    *puDst = uSrc >> cMasked; \
}
3665
/* Both the unsuffixed and the _fallback workers are emitted only when the host
   architecture condition holds or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRX(64, uint64_t, RT_NOTHING)
EMIT_SHRX(64, uint64_t, _fallback)
#endif
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRX(32, uint32_t, RT_NOTHING)
EMIT_SHRX(32, uint32_t, _fallback)
#endif
3674
3675
3676/*
3677 * SARX (BMI2)
3678 */
/**
 * Emits iemAImpl_sarx_u<width><a_Suffix>: right shift of uSrc, reinterpreted as
 * the signed type a_iType, by the count masked with a_cBitsWidth - 1.  The
 * result is stored in *puDst.  No flags are involved.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_iType         Signed integer type of that width.
 * @param   a_Suffix        Function name suffix.
 */
#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
{ \
    a_uType const cMasked = cShift & (a_cBitsWidth - 1); \
    a_iType const iSigned = (a_iType)uSrc; \
    *puDst = iSigned >> cMasked; \
}
3685
/* Both the unsuffixed and the _fallback workers are emitted only when the host
   architecture condition holds or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
EMIT_SARX(64, uint64_t, int64_t, _fallback)
#endif
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
EMIT_SARX(32, uint32_t, int32_t, _fallback)
#endif
3694
3695
3696/*
3697 * PDEP (BMI2)
3698 */
/**
 * Emits iemAImpl_pdep_u<width><a_Suffix>: parallel bit deposit.
 *
 * Walking the set bits of fMask from least to most significant, the k-th set
 * mask bit receives bit k of uSrc; all other result bits are zero.
 *
 * @param   a_cBitsWidth    Operand width in bits (must match a_uType).
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix.
 */
#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
{ \
    a_uType  uDeposited = 0; \
    a_uType  fRemaining = fMask; \
    unsigned iSrcBit    = 0; \
    while (fRemaining) \
    { \
        /* Isolate the lowest mask bit still to be processed. */ \
        a_uType const fLowest = fRemaining & (a_uType)(~fRemaining + 1); \
        if ((uSrc >> iSrcBit) & 1) \
            uDeposited |= fLowest; \
        fRemaining ^= fLowest; \
        iSrcBit++; \
    } \
    *puDst = uDeposited; \
}
3711
/* The _fallback workers are always emitted; the unsuffixed ones only when the
   host architecture condition holds or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_PDEP(64, uint64_t, RT_NOTHING)
#endif
EMIT_PDEP(64, uint64_t, _fallback)
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_PDEP(32, uint32_t, RT_NOTHING)
#endif
EMIT_PDEP(32, uint32_t, _fallback)
3720
3721/*
3722 * PEXT (BMI2)
3723 */
/**
 * Emits iemAImpl_pext_u<width><a_Suffix>: parallel bit extract.
 *
 * Walking the set bits of fMask from least to most significant, the uSrc bit
 * at the k-th set mask position is copied to result bit k; the remaining
 * result bits are zero.
 *
 * @param   a_cBitsWidth    Operand width in bits (must match a_uType).
 * @param   a_uType         Unsigned integer type of that width.
 * @param   a_Suffix        Function name suffix.
 */
#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
{ \
    a_uType  uExtracted = 0; \
    a_uType  fRemaining = fMask; \
    unsigned iDstBit    = 0; \
    while (fRemaining) \
    { \
        /* Isolate the lowest mask bit still to be processed. */ \
        a_uType const fLowest = fRemaining & (a_uType)(~fRemaining + 1); \
        if (uSrc & fLowest) \
            uExtracted |= (a_uType)1 << iDstBit; \
        fRemaining ^= fLowest; \
        iDstBit++; \
    } \
    *puDst = uExtracted; \
}
3736
/* The _fallback workers are always emitted; the unsuffixed ones only when the
   host architecture condition holds or IEM_WITHOUT_ASSEMBLY is defined. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_PEXT(64, uint64_t, RT_NOTHING)
#endif
EMIT_PEXT(64, uint64_t, _fallback)
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_PEXT(32, uint32_t, RT_NOTHING)
#endif
EMIT_PEXT(32, uint32_t, _fallback)
3745
3746
3747#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3748
3749# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3750/*
3751 * BSWAP
3752 */
3753
3754IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3755{
3756 *puDst = ASMByteSwapU64(*puDst);
3757}
3758
3759
3760IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3761{
3762 *puDst = ASMByteSwapU32(*puDst);
3763}
3764
3765
/**
 * BSWAP with a 16-bit operand size.
 *
 * The active code path stores zero through a uint16_t view of the destination;
 * the disabled (\#if 0) path would byte-swap that 16-bit value instead.
 *
 * @param   puDst   The destination.
 * @note    Undocumented operand size, so a 32-bit argument is used.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
{
#if 0
    *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
#else
    /* This is the behaviour AMD 3990x (64-bit mode): */
    *(uint16_t *)puDst = 0;
#endif
}
3776
3777# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3778
3779
3780
3781# if defined(IEM_WITHOUT_ASSEMBLY)
3782
3783/*
3784 * LFENCE, SFENCE & MFENCE.
3785 */
3786
/**
 * LFENCE worker for builds without assembly: implemented by a single call to
 * ASMReadFence().
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
{
    ASMReadFence();
}
3791
3792
/**
 * SFENCE worker for builds without assembly: implemented by a single call to
 * ASMWriteFence().
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
{
    ASMWriteFence();
}
3797
3798
/**
 * MFENCE worker for builds without assembly: implemented by a single call to
 * ASMMemoryFence().
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
{
    ASMMemoryFence();
}
3803
3804
3805# ifndef RT_ARCH_ARM64
/**
 * Alternative memory fence worker (not compiled when RT_ARCH_ARM64 is defined):
 * implemented, like iemAImpl_mfence, by a single call to ASMMemoryFence().
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
{
    ASMMemoryFence();
}
3810# endif
3811
3812# endif
3813
3814#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3815
3816
3817IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3818{
3819 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3820 {
3821 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3822 *pu16Dst |= u16Src & X86_SEL_RPL;
3823
3824 *pfEFlags |= X86_EFL_ZF;
3825 }
3826 else
3827 *pfEFlags &= ~X86_EFL_ZF;
3828}
3829
3830
3831#if defined(IEM_WITHOUT_ASSEMBLY)
3832
3833/*********************************************************************************************************************************
3834* x87 FPU Loads *
3835*********************************************************************************************************************************/
3836
/**
 * FLD m32real: converts a 32-bit floating point value to the 80-bit format.
 *
 * The input is classified (normal, zero, subnormal, infinity, NaN) and the
 * matching 80-bit encoding is written to pFpuRes->r80Result.  Subnormal
 * inputs raise DE and signalling NaNs raise IE; when the corresponding
 * exception is unmasked in FCW, ES and B are set as well.
 *
 * @param   pFpuState   The FPU state; FCW masks and FSW C0/C2/C3 are read.
 * @param   pFpuRes     Where to return the 80-bit value and the new FSW.
 * @param   pr32Val     The 32-bit value to load.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (RTFLOAT32U_IS_NORMAL(pr32Val))
    {
        /* Same sign, explicit integer bit, fraction left-aligned, exponent re-biased. */
        pFpuRes->r80Result.sj64.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
        pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
        Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
    }
    else if (RTFLOAT32U_IS_ZERO(pr32Val))
    {
        /* Signed zero. */
        pFpuRes->r80Result.s.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = 0;
        pFpuRes->r80Result.s.uMantissa = 0;
        Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
    }
    else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
    {
        /* Subnormal values gets normalized. */
        pFpuRes->r80Result.sj64.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        /* cExtraShift = number of leading zero bits in the source fraction field. */
        unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
        /* The extra '+ 1' shifts the leading one bit out of the fraction field (fInteger stands in for it). */
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
        pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
        pFpuRes->FSW |= X86_FSW_DE;
        if (!(pFpuState->FCW & X86_FCW_DM))
            pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
    }
    else if (RTFLOAT32U_IS_INF(pr32Val))
    {
        /* Signed infinity: maximum exponent, only the integer bit set. */
        pFpuRes->r80Result.s.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
        Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
    }
    else
    {
        /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
        Assert(RTFLOAT32U_IS_NAN(pr32Val));
        pFpuRes->r80Result.sj64.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
        if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
        {
            pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
            pFpuRes->FSW |= X86_FSW_IE;

            if (!(pFpuState->FCW & X86_FCW_IM))
            {
                /* The value is not pushed. */
                pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
                pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
                pFpuRes->r80Result.au64[0] = 0;
                pFpuRes->r80Result.au16[4] = 0;
            }
        }
        else
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
    }
}
3904
3905
/**
 * FLD m64real: converts a 64-bit floating point value to the 80-bit format.
 *
 * Same structure as iemAImpl_fld_r80_from_r32: the input is classified and
 * the matching 80-bit encoding is written to pFpuRes->r80Result.  Subnormal
 * inputs raise DE and signalling NaNs raise IE; when the corresponding
 * exception is unmasked in FCW, ES and B are set as well.
 *
 * @param   pFpuState   The FPU state; FCW masks and FSW C0/C2/C3 are read.
 * @param   pFpuRes     Where to return the 80-bit value and the new FSW.
 * @param   pr64Val     The 64-bit value to load.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (RTFLOAT64U_IS_NORMAL(pr64Val))
    {
        /* Same sign, explicit integer bit, fraction left-aligned, exponent re-biased. */
        pFpuRes->r80Result.sj64.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
        pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
        Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
    }
    else if (RTFLOAT64U_IS_ZERO(pr64Val))
    {
        /* Signed zero. */
        pFpuRes->r80Result.s.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = 0;
        pFpuRes->r80Result.s.uMantissa = 0;
        Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
    }
    else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
    {
        /* Subnormal values gets normalized. */
        pFpuRes->r80Result.sj64.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        /* cExtraShift = number of leading zero bits in the source fraction field. */
        unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
        /* The extra '+ 1' shifts the leading one bit out of the fraction field (fInteger stands in for it). */
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
        pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
        pFpuRes->FSW |= X86_FSW_DE;
        if (!(pFpuState->FCW & X86_FCW_DM))
            pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
    }
    else if (RTFLOAT64U_IS_INF(pr64Val))
    {
        /* Signed infinity: maximum exponent, only the integer bit set. */
        pFpuRes->r80Result.s.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
        Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
    }
    else
    {
        /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
        Assert(RTFLOAT64U_IS_NAN(pr64Val));
        pFpuRes->r80Result.sj64.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
        if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
        {
            pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
            pFpuRes->FSW |= X86_FSW_IE;

            if (!(pFpuState->FCW & X86_FCW_IM))
            {
                /* The value is not pushed. */
                pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
                pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
                pFpuRes->r80Result.au64[0] = 0;
                pFpuRes->r80Result.au16[4] = 0;
            }
        }
        else
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
    }
}
3971
3972
3973IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3974{
3975 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3976 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3977 /* Raises no exceptions. */
3978 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3979}
3980
3981
3982IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3983{
3984 pFpuRes->r80Result.sj64.fSign = 0;
3985 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3986 pFpuRes->r80Result.sj64.fInteger = 1;
3987 pFpuRes->r80Result.sj64.uFraction = 0;
3988
3989 /*
3990 * FPU status word:
3991 * - TOP is irrelevant, but we must match x86 assembly version.
3992 * - C1 is always cleared as we don't have any stack overflows.
3993 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3994 */
3995 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3996}
3997
3998
3999IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4000{
4001 pFpuRes->r80Result.sj64.fSign = 0;
4002 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4003 pFpuRes->r80Result.sj64.fInteger = 1;
4004 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4005 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4006 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
4007 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4008}
4009
4010
4011IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4012{
4013 pFpuRes->r80Result.sj64.fSign = 0;
4014 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4015 pFpuRes->r80Result.sj64.fInteger = 1;
4016 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
4017 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
4018 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4019}
4020
4021
4022IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4023{
4024 pFpuRes->r80Result.sj64.fSign = 0;
4025 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
4026 pFpuRes->r80Result.sj64.fInteger = 1;
4027 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4028 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4029 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
4030 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4031}
4032
4033
4034IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4035{
4036 pFpuRes->r80Result.sj64.fSign = 0;
4037 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
4038 pFpuRes->r80Result.sj64.fInteger = 1;
4039 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4040 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4041 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
4042 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4043}
4044
4045
4046IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4047{
4048 pFpuRes->r80Result.sj64.fSign = 0;
4049 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4050 pFpuRes->r80Result.sj64.fInteger = 1;
4051 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4052 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4053 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
4054 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4055}
4056
4057
4058IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4059{
4060 pFpuRes->r80Result.s.fSign = 0;
4061 pFpuRes->r80Result.s.uExponent = 0;
4062 pFpuRes->r80Result.s.uMantissa = 0;
4063 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4064}
4065
/**
 * Emits iemAImpl_fild_r80_from_i<a_cBits>: converts a signed integer to the
 * 80-bit floating point format.
 *
 * Zero yields +0.0.  Otherwise the magnitude is normalised so that its most
 * significant set bit becomes the integer bit (bit 63) of the mantissa, and
 * the exponent is that bit's index plus the bias.  This code sets no
 * exception flags; FSW gets TOP=7 plus the incoming C0/C2/C3.
 *
 * The magnitude is computed in the unsigned type of the same width, so that
 * the most negative input does not rely on signed negation overflow.
 *
 * @param   a_cBits     Integer width in bits (16, 32 or 64).
 */
#define EMIT_FILD(a_cBits) \
IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
                                                             int ## a_cBits ## _t const *piVal)) \
{ \
    int ## a_cBits ## _t const iVal = *piVal; \
    if (iVal == 0) \
    { \
        pFpuRes->r80Result.s.fSign     = 0; \
        pFpuRes->r80Result.s.uExponent = 0; \
        pFpuRes->r80Result.s.uMantissa = 0; \
    } \
    else \
    { \
        uint ## a_cBits ## _t uMagnitude = (uint ## a_cBits ## _t)iVal; \
        if (iVal > 0) \
            pFpuRes->r80Result.s.fSign = 0; \
        else \
        { \
            pFpuRes->r80Result.s.fSign = 1; \
            /* Two's complement negation in the unsigned domain; well defined for the minimum value too. */ \
            uMagnitude = (uint ## a_cBits ## _t)(0 - uMagnitude); \
        } \
        /* 1-based index of the most significant set bit. */ \
        unsigned const cBits = ASMBitLastSetU ## a_cBits(uMagnitude); \
        pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
        pFpuRes->r80Result.s.uMantissa = (uint64_t)uMagnitude << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
    } \
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
}
/* Instantiate the integer load workers for 16-bit, 32-bit and 64-bit operands. */
EMIT_FILD(16)
EMIT_FILD(32)
EMIT_FILD(64)
4095
4096
4097IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
4098{
4099 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4100 if ( pd80Val->s.abPairs[0] == 0
4101 && pd80Val->s.abPairs[1] == 0
4102 && pd80Val->s.abPairs[2] == 0
4103 && pd80Val->s.abPairs[3] == 0
4104 && pd80Val->s.abPairs[4] == 0
4105 && pd80Val->s.abPairs[5] == 0
4106 && pd80Val->s.abPairs[6] == 0
4107 && pd80Val->s.abPairs[7] == 0
4108 && pd80Val->s.abPairs[8] == 0)
4109 {
4110 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4111 pFpuRes->r80Result.s.uExponent = 0;
4112 pFpuRes->r80Result.s.uMantissa = 0;
4113 }
4114 else
4115 {
4116 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4117
4118 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
4119 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
4120 cPairs--;
4121
4122 uint64_t uVal = 0;
4123 uint64_t uFactor = 1;
4124 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
4125 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
4126 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
4127
4128 unsigned const cBits = ASMBitLastSetU64(uVal);
4129 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
4130 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
4131 }
4132}
4133
4134
4135/*********************************************************************************************************************************
4136* x87 FPU Stores *
4137*********************************************************************************************************************************/
4138
/**
 * Helper for storing a deconstructed and normal R80 value as a 32-bit one.
 *
 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
 *
 * @returns Updated FPU status word value.
 * @param   fSignIn         Incoming sign indicator.
 * @param   uMantissaIn     Incoming mantissa (dot between bit 63 and 62).
 * @param   iExponentIn     Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            Prepped FPU status word, i.e. exceptions and C1 clear.
 * @param   pr32Dst         Where to return the output value, if one should be
 *                          returned.  Left untouched on the early returns taken
 *                          for unmasked underflow and unmasked overflow.
 *
 * @note    Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
 * @note    Exact same logic as iemAImpl_StoreNormalR80AsR64.
 */
static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
                                             uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
{
    /* Mask of the mantissa bits that do not fit into the 32-bit fraction
       (0xffffffffff, assuming the usual 63-bit and 23-bit fraction widths). */
    uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1;
    /* Value added before truncating: half an ULP for nearest (0x8000000000 under
       the same assumption), all dropped bits set when rounding away from zero
       for this sign, zero otherwise. */
    uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                    ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1)
                                    : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                    ? fRoundingOffMask
                                    : 0;
    uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;

    /*
     * Deal with potential overflows/underflows first, optimizing for none.
     * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
     */
    int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
    /* Single unsigned compare for 1 <= iExponentOut <= RTFLOAT32U_EXP_MAX - 3. */
    if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
    { /* likely? */ }
    /*
     * Underflow if the exponent zero or negative.  This is attempted mapped
     * to a subnormal number when possible, with some additional trickery ofc.
     */
    else if (iExponentOut <= 0)
    {
        bool const fIsTiny = iExponentOut < 0
                          || UINT64_MAX - uMantissaIn > uRoundingAdd;
        if (!(fFcw & X86_FCW_UM) && fIsTiny)
            /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
            return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;

        if (iExponentOut <= 0)
        {
            /* Denormalize: shift the mantissa right, folding all bits shifted out
               into bit 0 so the rounding below still sees that something was lost. */
            uMantissaIn = iExponentOut <= -63
                        ? uMantissaIn != 0
                        : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
            fRoundedOff = uMantissaIn & fRoundingOffMask;
            if (fRoundedOff && fIsTiny)
                fFsw |= X86_FSW_UE;
            iExponentOut = 0;
        }
    }
    /*
     * Overflow if at or above max exponent value or if we will reach max
     * when rounding.  Will return +/-infinity or +/-max value depending on
     * whether we're rounding or not.
     */
    else if (   iExponentOut >= RTFLOAT32U_EXP_MAX
             || (   iExponentOut == RTFLOAT32U_EXP_MAX - 1
                 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
    {
        fFsw |= X86_FSW_OE;
        if (!(fFcw & X86_FCW_OM))
            return fFsw | X86_FSW_ES | X86_FSW_B;
        fFsw |= X86_FSW_PE;
        if (uRoundingAdd)
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;

        pr32Dst->s.fSign = fSignIn;
        if (uRoundingAdd)
        {   /* Infinity (maximum exponent, zero fraction) */
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = 0;
        }
        else
        {   /* Max */
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
            pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
        }
        return fFsw;
    }

    /*
     * Normal or subnormal number.
     */
    /* Do rounding - just truncate in near mode when midway on an even outcome. */
    uint64_t uMantissaOut = uMantissaIn;
    if (   (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
        || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
        || fRoundedOff != uRoundingAdd)
    {
        uMantissaOut = uMantissaIn + uRoundingAdd;
        if (uMantissaOut >= uMantissaIn)
        { /* likely */ }
        else
        {
            /* The addition wrapped, i.e. the mantissa carried out of bit 63. */
            uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
            iExponentOut++;
            Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
            fFsw |= X86_FSW_C1;
        }
    }
    else
        uMantissaOut = uMantissaIn;

    /* Truncate the mantissa and set the return value. */
    uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;

    pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
    pr32Dst->s.uExponent = iExponentOut;
    pr32Dst->s.fSign     = fSignIn;

    /* Set status flags related to rounding. */
    if (fRoundedOff)
    {
        fFsw |= X86_FSW_PE;
        /* C1 indicates that the stored value was rounded up in magnitude. */
        if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    return fFsw;
}
4271
4272
/**
 * FST/FSTP m32real: converts an 80-bit floating point value to a 32-bit one.
 *
 * Normal inputs are rounded by iemAImpl_StoreNormalR80AsR32.  Zero, infinity
 * and the indefinite value map directly to their 32-bit encodings.  Pseudo
 * and unnormal encodings as well as signalling NaNs raise IE, and denormals
 * raise UE (and PE when masked); *pr32Dst is only written when the relevant
 * exception is masked in FCW.
 *
 * @param   pFpuState   The FPU state; FCW and FSW C0/C2/C3 are read.
 * @param   pu16FSW     Where to return the new FPU status word.
 * @param   pr32Dst     Where to store the converted value.
 * @param   pr80Src     The 80-bit value to convert.
 *
 * @note    Exact same logic as iemAImpl_fst_r80_to_r64.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t       fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
        fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
                                            (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
    {
        /* Signed zero. */
        pr32Dst->s.fSign     = pr80Src->s.fSign;
        pr32Dst->s.uExponent = 0;
        pr32Dst->s.uFraction = 0;
        Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
    }
    else if (RTFLOAT80U_IS_INF(pr80Src))
    {
        /* Signed infinity. */
        pr32Dst->s.fSign     = pr80Src->s.fSign;
        pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
        pr32Dst->s.uFraction = 0;
        Assert(RTFLOAT32U_IS_INF(pr32Dst));
    }
    else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
    {
        /* Mapped to +/-QNaN */
        pr32Dst->s.fSign     = pr80Src->s.fSign;
        pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
        pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
    }
    else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
    {
        /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
        if (fFcw & X86_FCW_IM)
        {
            /* Masked: store a negative quiet NaN with only the top fraction bit set. */
            pr32Dst->s.fSign     = 1;
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
            fFsw |= X86_FSW_IE;
        }
        else
            fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
    }
    else if (RTFLOAT80U_IS_NAN(pr80Src))
    {
        /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
        if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
        {
            /* Keep the top fraction bits of the payload and force the quiet bit. */
            pr32Dst->s.fSign     = pr80Src->s.fSign;
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
            pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
            if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
                fFsw |= X86_FSW_IE;
        }
        else
            fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
    }
    else
    {
        /* Denormal values causes both an underflow and precision exception. */
        Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
        if (fFcw & X86_FCW_UM)
        {
            pr32Dst->s.fSign     = pr80Src->s.fSign;
            pr32Dst->s.uExponent = 0;
            /* Rounding away from zero for this sign yields the smallest subnormal, otherwise zero. */
            if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
            {
                pr32Dst->s.uFraction = 1;
                fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
                if (!(fFcw & X86_FCW_PM))
                    fFsw |= X86_FSW_ES | X86_FSW_B;
            }
            else
            {
                pr32Dst->s.uFraction = 0;
                fFsw |= X86_FSW_UE | X86_FSW_PE;
                if (!(fFcw & X86_FCW_PM))
                    fFsw |= X86_FSW_ES | X86_FSW_B;
            }
        }
        else
            fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
    }
    *pu16FSW = fFsw;
}
4361
4362
4363/**
4364 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4365 *
4366 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4367 *
4368 * @returns Updated FPU status word value.
4369 * @param fSignIn Incoming sign indicator.
4370 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4371 * @param iExponentIn Unbiased exponent.
4372 * @param fFcw The FPU control word.
4373 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4374 * @param pr64Dst Where to return the output value, if one should be
4375 * returned.
4376 *
4377 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4378 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4379 */
4380static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4381 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4382{
4383 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4384 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4385 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4386 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4387 ? fRoundingOffMask
4388 : 0;
4389 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4390
4391 /*
4392 * Deal with potential overflows/underflows first, optimizing for none.
4393 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4394 */
4395 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4396 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4397 { /* likely? */ }
4398 /*
4399 * Underflow if the exponent zero or negative. This is attempted mapped
4400 * to a subnormal number when possible, with some additional trickery ofc.
4401 */
4402 else if (iExponentOut <= 0)
4403 {
4404 bool const fIsTiny = iExponentOut < 0
4405 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4406 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4407 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4408 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4409
4410 if (iExponentOut <= 0)
4411 {
4412 uMantissaIn = iExponentOut <= -63
4413 ? uMantissaIn != 0
4414 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4415 fRoundedOff = uMantissaIn & fRoundingOffMask;
4416 if (fRoundedOff && fIsTiny)
4417 fFsw |= X86_FSW_UE;
4418 iExponentOut = 0;
4419 }
4420 }
4421 /*
4422 * Overflow if at or above max exponent value or if we will reach max
4423 * when rounding. Will return +/-zero or +/-max value depending on
4424 * whether we're rounding or not.
4425 */
4426 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4427 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4428 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4429 {
4430 fFsw |= X86_FSW_OE;
4431 if (!(fFcw & X86_FCW_OM))
4432 return fFsw | X86_FSW_ES | X86_FSW_B;
4433 fFsw |= X86_FSW_PE;
4434 if (uRoundingAdd)
4435 fFsw |= X86_FSW_C1;
4436 if (!(fFcw & X86_FCW_PM))
4437 fFsw |= X86_FSW_ES | X86_FSW_B;
4438
4439 pr64Dst->s64.fSign = fSignIn;
4440 if (uRoundingAdd)
4441 { /* Zero */
4442 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4443 pr64Dst->s64.uFraction = 0;
4444 }
4445 else
4446 { /* Max */
4447 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4448 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4449 }
4450 return fFsw;
4451 }
4452
4453 /*
4454 * Normal or subnormal number.
4455 */
4456 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4457 uint64_t uMantissaOut = uMantissaIn;
4458 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4459 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4460 || fRoundedOff != uRoundingAdd)
4461 {
4462 uMantissaOut = uMantissaIn + uRoundingAdd;
4463 if (uMantissaOut >= uMantissaIn)
4464 { /* likely */ }
4465 else
4466 {
4467 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4468 iExponentOut++;
4469 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4470 fFsw |= X86_FSW_C1;
4471 }
4472 }
4473 else
4474 uMantissaOut = uMantissaIn;
4475
4476 /* Truncate the mantissa and set the return value. */
4477 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4478
4479 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4480 pr64Dst->s64.uExponent = iExponentOut;
4481 pr64Dst->s64.fSign = fSignIn;
4482
4483 /* Set status flags realted to rounding. */
4484 if (fRoundedOff)
4485 {
4486 fFsw |= X86_FSW_PE;
4487 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4488 fFsw |= X86_FSW_C1;
4489 if (!(fFcw & X86_FCW_PM))
4490 fFsw |= X86_FSW_ES | X86_FSW_B;
4491 }
4492
4493 return fFsw;
4494}
4495
4496
4497/**
4498 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4499 */
4500IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4501 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4502{
4503 uint16_t const fFcw = pFpuState->FCW;
4504 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4505 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4506 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4507 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4508 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4509 {
4510 pr64Dst->s64.fSign = pr80Src->s.fSign;
4511 pr64Dst->s64.uExponent = 0;
4512 pr64Dst->s64.uFraction = 0;
4513 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4514 }
4515 else if (RTFLOAT80U_IS_INF(pr80Src))
4516 {
4517 pr64Dst->s64.fSign = pr80Src->s.fSign;
4518 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4519 pr64Dst->s64.uFraction = 0;
4520 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4521 }
4522 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4523 {
4524 /* Mapped to +/-QNaN */
4525 pr64Dst->s64.fSign = pr80Src->s.fSign;
4526 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4527 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4528 }
4529 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4530 {
4531 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4532 if (fFcw & X86_FCW_IM)
4533 {
4534 pr64Dst->s64.fSign = 1;
4535 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4536 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4537 fFsw |= X86_FSW_IE;
4538 }
4539 else
4540 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4541 }
4542 else if (RTFLOAT80U_IS_NAN(pr80Src))
4543 {
4544 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4545 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4546 {
4547 pr64Dst->s64.fSign = pr80Src->s.fSign;
4548 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4549 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4550 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4551 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4552 fFsw |= X86_FSW_IE;
4553 }
4554 else
4555 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4556 }
4557 else
4558 {
4559 /* Denormal values causes both an underflow and precision exception. */
4560 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4561 if (fFcw & X86_FCW_UM)
4562 {
4563 pr64Dst->s64.fSign = pr80Src->s.fSign;
4564 pr64Dst->s64.uExponent = 0;
4565 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4566 {
4567 pr64Dst->s64.uFraction = 1;
4568 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4569 if (!(fFcw & X86_FCW_PM))
4570 fFsw |= X86_FSW_ES | X86_FSW_B;
4571 }
4572 else
4573 {
4574 pr64Dst->s64.uFraction = 0;
4575 fFsw |= X86_FSW_UE | X86_FSW_PE;
4576 if (!(fFcw & X86_FCW_PM))
4577 fFsw |= X86_FSW_ES | X86_FSW_B;
4578 }
4579 }
4580 else
4581 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4582 }
4583 *pu16FSW = fFsw;
4584}
4585
4586
4587IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4588 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4589{
4590 /*
4591 * FPU status word:
4592 * - TOP is irrelevant, but we must match x86 assembly version (0).
4593 * - C1 is always cleared as we don't have any stack overflows.
4594 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4595 */
4596 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4597 *pr80Dst = *pr80Src;
4598}
4599
4600
/*
 * Template for the 80-bit floating point to signed integer store workers
 * (iemAImpl_fist_r80_to_i16, iemAImpl_fist_r80_to_i32 and
 * iemAImpl_fist_r80_to_i64), rounding according to the RC field of FCW.
 *
 * Macro parameters:
 *      a_cBits             Width of the destination integer in bits.
 *      a_iType             The destination integer type.
 *      a_iTypeMin          The smallest value a_iType can hold.
 *      a_iTypeIndefinite   The value stored when a masked invalid operand
 *                          exception (IE) is raised.
 *
 * Generated function: FCW and the C0/C2/C3 bits of FSW are taken from
 * pFpuState, the resulting status word is returned via pu16FSW and the integer
 * via piDst.  When IE is raised unmasked, piDst is left untouched and TOP=7 is
 * put into the returned status word together with ES and B.
 *
 * Mantissa:
 *         63        56        48        40        32        24        16         8         0
 *         v          v         v         v         v         v         v         v         v
 *         1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
 *             \    \    \    \    \    \    \    \    \    \    \    \    \    \    \    \
 * Exp:         0    4    8    12   16   20   24   28   32   36   40   44   48   52   56   60
 *
 * int64_t has the same width, only bit 63 is the sign bit.  So, the max we can map over
 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62.  The number of bits we
 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
 * where we'll drop off all but bit 63.
 */
#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                           a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t uMantissa = pr80Val->s.uMantissa; \
        int32_t  iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        /* Exponents 0 thru a_cBits - 2: the magnitude fits, unless rounding carries it up to 2**(a_cBits - 1). */ \
        if ((uint32_t)iExponent <= a_cBits - 2) \
        { \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
                                            ? RT_BIT_64(cShiftOff - 1) \
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
                                            ? fRoundingOffMask \
                                            : 0; \
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask; \
            \
            /* Drop the fraction bits and add the carry (0 or 1) produced by rounding them. */ \
            uMantissa >>= cShiftOff; \
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
            uMantissa += uRounding; \
            if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
            { \
                if (fRoundedOff) \
                { \
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
                    else if (uRounding) \
                        fFsw |= X86_FSW_C1; \
                    fFsw |= X86_FSW_PE; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                \
                if (!fSignIn) \
                    *piDst = (a_iType)uMantissa; \
                else \
                    *piDst = -(a_iType)uMantissa; \
            } \
            else \
            { \
                /* overflowed after rounding. */ \
                AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
                          ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
                           pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
                \
                /* Special case for the integer minimum value. */ \
                if (fSignIn) \
                { \
                    *piDst = a_iTypeMin; \
                    fFsw |= X86_FSW_PE | X86_FSW_C1; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                else \
                { \
                    /* Positive overflow: invalid operand.  Note that a_iTypeMin is stored here, */ \
                    /* whereas the other IE paths store a_iTypeIndefinite. */ \
                    fFsw |= X86_FSW_IE; \
                    if (fFcw & X86_FCW_IM) \
                        *piDst = a_iTypeMin; \
                    else \
                        fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
                } \
            } \
        } \
        /* \
         * Tiny sub-zero numbers. \
         */ \
        else if (iExponent < 0) \
        { \
            if (!fSignIn) \
            { \
                /* Positive: becomes 1 when rounding up, or in nearest mode for values in [0.5, 1). */ \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                { \
                    *piDst = 1; \
                    fFsw |= X86_FSW_C1; \
                } \
                else \
                    *piDst = 0; \
            } \
            else \
            { \
                /* Negative: becomes -1 when rounding down, or in nearest mode for values in (-1, -0.5]. */ \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                    *piDst = 0; \
                else \
                { \
                    *piDst = -1; \
                    fFsw |= X86_FSW_C1; \
                } \
            } \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (  a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63))) \
        { \
            *piDst = a_iTypeMin; \
            /* Any mantissa bits below the integer range mean the result is inexact. */ \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are either rounded \
     * to zero, 1 or -1 depending on sign and rounding control. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
            *piDst = 0; \
        else \
        { \
            *piDst = fSignIn ? -1 : 1; \
            fFsw |= X86_FSW_C1; \
        } \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4787
4788#endif /*IEM_WITHOUT_ASSEMBLY */
4789
4790
/*
 * The FISTT instruction was added with SSE3 and is a lot simpler than FIST,
 * as the value is always truncated (the RC field of FCW is not consulted).
 *
 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
 * thus the @a a_cBitsIn.
 *
 * Macro parameters:
 *      a_cBits             Width of the destination integer in bits.
 *      a_cBitsIn           Width used for the "fits in the integer" exponent
 *                          check.  All the instantiations below currently
 *                          pass the same value as a_cBits.
 *      a_iType             The destination integer type.
 *      a_iTypeMin          The smallest value a_iType can hold.
 *      a_iTypeMax          Not referenced by the macro body.
 *      a_iTypeIndefinite   The value stored when a masked invalid operand
 *                          exception (IE) is raised.
 *      a_Suffix            Function name suffix (RT_NOTHING, _intel, _amd).
 *      a_fIntelVersion     Not referenced by the macro body.
 */
#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                                              a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t uMantissa = pr80Val->s.uMantissa; \
        int32_t  iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        if ((uint32_t)iExponent <= a_cBitsIn - 2) \
        { \
            /* Chop off the fraction bits; a non-zero remainder only affects the precision flag. */ \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            uint64_t const fRoundedOff      = uMantissa & fRoundingOffMask; \
            uMantissa >>= cShiftOff; \
            /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
            if (!fSignIn) \
                *piDst = (a_iType)uMantissa; \
            else \
                *piDst = -(a_iType)uMantissa; \
            \
            if (fRoundedOff) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Tiny sub-zero numbers. \
         */ \
        else if (iExponent < 0) \
        { \
            *piDst = 0; \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (a_cBits < 64 \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63)) ) \
        { \
            *piDst = a_iTypeMin; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Figure this weirdness. \
         * (Branch is disabled by the leading constant 0 in the condition.) \
         */ \
        else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
        { \
            *piDst = 0; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are truncated to zero. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        *piDst = 0; \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
/* The suffix-less variants are only emitted for the portable C build; the
   16-bit _intel and _amd variants are emitted unconditionally. */
#if defined(IEM_WITHOUT_ASSEMBLY)
EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
#endif
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4920
4921
4922#if defined(IEM_WITHOUT_ASSEMBLY)
4923
/**
 * Converts an 80-bit floating point value to an 80-bit packed BCD integer,
 * rounding according to the RC field of FCW.
 *
 * When an invalid operand exception (IE) is raised unmasked, the destination
 * is left untouched and TOP=7 is put into the returned status word together
 * with ES and B.
 *
 * @param   pFpuState   FPU state supplying FCW and the C0/C2/C3 bits of FSW.
 * @param   pu16FSW     Where to return the resulting status word.
 * @param   pd80Dst     Where to store the packed BCD result.
 * @param   pr80Src     The value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
{
    /* Result constants; the two-entry tables are indexed by the input sign bit. */
    /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
    static RTPBCD80U const s_ad80Zeros[2]  = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
    static RTPBCD80U const s_ad80One[2]    = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
                                               RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
    static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();

    uint16_t const fFcw    = pFpuState->FCW;
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    bool const     fSignIn = pr80Src->s.fSign;

    /*
     * Deal with normal numbers first.
     */
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
    {
        uint64_t uMantissa = pr80Src->s.uMantissa;
        int32_t  iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
        /* 0xde0b6b3a763fffff >> 4 is 999999999999999999, i.e. the largest 18 digit number. */
        if (   (uint32_t)iExponent <= 58
            || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
        {
            unsigned const cShiftOff        = 63 - iExponent;
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                            ? RT_BIT_64(cShiftOff - 1)
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                            ? fRoundingOffMask
                                            : 0;
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask;

            /* Drop the fraction bits and add the carry (0 or 1) produced by rounding them. */
            uMantissa >>= cShiftOff;
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
            uMantissa += uRounding;
            if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
            {
                if (fRoundedOff)
                {
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
                    else if (uRounding)
                        fFsw |= X86_FSW_C1;
                    fFsw |= X86_FSW_PE;
                    if (!(fFcw & X86_FCW_PM))
                        fFsw |= X86_FSW_ES | X86_FSW_B;
                }

                /* Emit the decimal digits two at a time, least significant pair first. */
                pd80Dst->s.fSign = fSignIn;
                pd80Dst->s.uPad  = 0;
                for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
                {
                    unsigned const uDigits = uMantissa % 100;
                    uMantissa /= 100;
                    uint8_t const bLo = uDigits % 10;
                    uint8_t const bHi = uDigits / 10;
                    pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
                }
            }
            else
            {
                /* overflowed after rounding. */
                fFsw |= X86_FSW_IE;
                if (fFcw & X86_FCW_IM)
                    *pd80Dst = s_d80Indefinite;
                else
                    fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
            }
        }
        /*
         * Tiny sub-zero numbers.
         */
        else if (iExponent < 0)
        {
            if (!fSignIn)
            {
                /* Positive: becomes 1 when rounding up, or in nearest mode for values in [0.5, 1). */
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
                else
                    *pd80Dst = s_ad80Zeros[fSignIn];
            }
            else
            {
                /* Negative: becomes -1 when rounding down, or in nearest mode for values in (-1, -0.5]. */
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                    *pd80Dst = s_ad80Zeros[fSignIn];
                else
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
            }
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        /*
         * Too large/small number outside the target integer range.
         */
        else
        {
            fFsw |= X86_FSW_IE;
            if (fFcw & X86_FCW_IM)
                *pd80Dst = s_d80Indefinite;
            else
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    /*
     * Map +0 and -0 to packed BCD zero.  The zero constant is picked by the
     * input sign bit (presumably keeping the sign, going by the 0/1 arguments
     * of the RTPBCD80U_INIT_ZERO initializers above).
     */
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
        *pd80Dst = s_ad80Zeros[fSignIn];
    /*
     * Denormals are just really tiny sub-zero numbers that are either rounded
     * to zero, 1 or -1 depending on sign and rounding control.
     */
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
    {
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
            *pd80Dst = s_ad80Zeros[fSignIn];
        else
        {
            *pd80Dst = s_ad80One[fSignIn];
            fFsw |= X86_FSW_C1;
        }
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }
    /*
     * All other special values are considered invalid arguments and result
     * in an IE exception and indefinite value if masked.
     */
    else
    {
        fFsw |= X86_FSW_IE;
        if (fFcw & X86_FCW_IM)
            *pd80Dst = s_d80Indefinite;
        else
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
    }
    *pu16FSW = fFsw;
}
5073
5074
5075/*********************************************************************************************************************************
5076* FPU Helpers *
5077*********************************************************************************************************************************/
/* Compile-time checks that the floating point types have the byte sizes the
   code below takes for granted (128-bit, 80-bit, 64-bit and 32-bit formats). */
AssertCompileSize(RTFLOAT128U, 16);
AssertCompileSize(RTFLOAT80U, 10);
AssertCompileSize(RTFLOAT64U, 8);
AssertCompileSize(RTFLOAT32U, 4);
5082
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro will declare a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
 * a normalization was performed.  The original value is not modified.
 *
 * @note    This must be applied before calling SoftFloat with a value that
 *          could be a pseudo-denormal, as SoftFloat doesn't handle
 *          pseudo-denormals correctly.
 * @note    The expansion starts with a declaration and ends with a dangling
 *          'else do {} while (0)', so it must be used as a statement of its
 *          own, terminated by a semicolon, and not as the body of an
 *          unbraced if/else/loop.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { \
        a_r80ValNormalized = *a_pr80Val; \
        a_r80ValNormalized.s.uExponent = 1; \
        a_pr80Val = &a_r80ValNormalized; \
    } else do {} while (0)
5106
5107#ifdef IEM_WITH_FLOAT128_FOR_FPU
5108
5109DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
5110{
5111 int fNew;
5112 switch (fFcw & X86_FCW_RC_MASK)
5113 {
5114 default:
5115 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
5116 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
5117 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
5118 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
5119 }
5120 int fOld = fegetround();
5121 fesetround(fNew);
5122 return fOld;
5123}
5124
5125
/**
 * Restores the host rounding mode.
 *
 * @param   fOld    The rounding mode to reinstate, i.e. the value returned by
 *                  iemFpuF128SetRounding.
 */
DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
{
    fesetround(fOld);
}
5130
5131DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
5132{
5133 RT_NOREF(fFcw);
5134 RTFLOAT128U Tmp;
5135 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
5136 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
5137 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
5138 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
5139 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
5140 {
5141 Assert(Tmp.s.uExponent == 0);
5142 Tmp.s2.uSignAndExponent++;
5143 }
5144 return *(_Float128 *)&Tmp;
5145}
5146
5147
5148DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5149{
5150 RT_NOREF(fFcw);
5151 RTFLOAT128U Tmp;
5152 *(_Float128 *)&Tmp = rd128ValSrc;
5153 ASMCompilerBarrier();
5154 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5155 {
5156 pr80Dst->s.fSign = Tmp.s64.fSign;
5157 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5158 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5159 | Tmp.s64.uFractionLo >> (64 - 15);
5160
5161 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5162 unsigned const cShiftOff = 64 - 15;
5163 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5164 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5165 if (uRoundedOff)
5166 {
5167 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5168 ? RT_BIT_64(cShiftOff - 1)
5169 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5170 ? fRoundingOffMask
5171 : 0;
5172 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5173 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5174 || uRoundedOff != uRoundingAdd)
5175 {
5176 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5177 {
5178 uFraction += 1;
5179 if (!(uFraction & RT_BIT_64(63)))
5180 { /* likely */ }
5181 else
5182 {
5183 uFraction >>= 1;
5184 pr80Dst->s.uExponent++;
5185 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5186 return fFsw;
5187 }
5188 fFsw |= X86_FSW_C1;
5189 }
5190 }
5191 fFsw |= X86_FSW_PE;
5192 if (!(fFcw & X86_FCW_PM))
5193 fFsw |= X86_FSW_ES | X86_FSW_B;
5194 }
5195 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5196 }
5197 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5198 {
5199 pr80Dst->s.fSign = Tmp.s64.fSign;
5200 pr80Dst->s.uExponent = 0;
5201 pr80Dst->s.uMantissa = 0;
5202 }
5203 else if (RTFLOAT128U_IS_INF(&Tmp))
5204 {
5205 pr80Dst->s.fSign = Tmp.s64.fSign;
5206 pr80Dst->s.uExponent = 0;
5207 pr80Dst->s.uMantissa = 0;
5208 }
5209 return fFsw;
5210}
5211
5212
5213#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5214
/**
 * Initializer for the SoftFloat state structure.
 *
 * Expands to a brace initializer with five values derived from the given FPU
 * control word, in this order:
 *      -# tininess detection: always softfloat_tininess_afterRounding,
 *      -# rounding mode: mapped from the RC field of FCW,
 *      -# zero (presumably the initial exception flags - TODO confirm against
 *         the softfloat_state_t member order),
 *      -# the exception mask bits of FCW (X86_FCW_XCPT_MASK),
 *      -# rounding precision: 64 for X86_FCW_PC_53, 32 for X86_FCW_PC_24 and
 *         80 for everything else.
 *
 * @param   a_fFcw  The FPU control word.  Evaluated multiple times.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
        ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP    ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN  ? (uint8_t)softfloat_round_min \
        :                                                    (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
        ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
    }
5228
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The result is @a a_fFsw ORed with:
 *      - the softfloat_flag_c1 bit of the SoftFloat exception flags shifted
 *        left by two (presumably landing on X86_FSW_C1 - TODO confirm against
 *        the softfloat_flag_c1 definition),
 *      - the SoftFloat exception flags covered by X86_FSW_XCPT_MASK, taken
 *        over as-is (so the flag bits are expected to share the FSW layout),
 *      - X86_FSW_ES and X86_FSW_B if any of those exceptions is unmasked, i.e.
 *        has its bit clear in @a a_fFcw.
 *
 * @param   a_fFsw          The FPU status word to update.
 * @param   a_pSoftState    Pointer to the SoftFloat state.  Evaluated
 *                          multiple times.
 * @param   a_fFcw          The FPU control word holding the exception masks.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (  (a_fFsw) \
     | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
     | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
     | (  ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
        ? X86_FSW_ES | X86_FSW_B : 0) )
5236
5237
5238DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5239{
5240 RT_NOREF(fFcw);
5241 Assert(cBits > 64);
5242# if 0 /* rounding does not seem to help */
5243 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5244 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5245 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5246 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5247 {
5248 uint64_t uOld = r128.v[0];
5249 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5250 if (r128.v[0] < uOld)
5251 r128.v[1] += 1;
5252 }
5253# else
5254 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5255# endif
5256 return r128;
5257}
5258
5259
5260DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5261{
5262 RT_NOREF(fFcw);
5263 Assert(cBits > 64);
5264# if 0 /* rounding does not seem to help, not even on constants */
5265 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5266 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5267 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5268 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5269 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5270 {
5271 uint64_t uOld = r128.v[0];
5272 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5273 if (r128.v[0] < uOld)
5274 r128.v[1] += 1;
5275 }
5276 return r128;
5277# else
5278 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5279 return r128;
5280# endif
5281}
5282
5283
# if 0 /* unused */
/**
 * Converts from the IPRT 128-bit floating point format (RTFLOAT128U) to the
 * SoftFloat one (float128_t) by copying the two 64-bit halves.
 *
 * Currently compiled out as nothing uses it.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
    return r128;
}
# endif
5291
5292
5293/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5294DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5295{
5296 extFloat80_t Tmp;
5297 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5298 Tmp.signif = pr80Val->s2.uMantissa;
5299 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5300 return extF80_to_f128(Tmp, &Ignored);
5301}
5302
5303
5304/**
5305 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5306 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5307 *
5308 * This is only a structure format conversion, nothing else.
5309 */
5310DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5311{
5312 extFloat80_t Tmp;
5313 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5314 Tmp.signif = pr80Val->s2.uMantissa;
5315 return Tmp;
5316}
5317
5318
5319/**
5320 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5321 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5322 *
5323 * This is only a structure format conversion, nothing else.
5324 */
5325DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5326{
5327 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5328 pr80Dst->s2.uMantissa = r80XSrc.signif;
5329 return pr80Dst;
5330}
5331
5332
5333DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5334{
5335 RT_NOREF(fFcw);
5336 RTFLOAT128U Tmp;
5337 *(float128_t *)&Tmp = r128Src;
5338 ASMCompilerBarrier();
5339
5340 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5341 {
5342 pr80Dst->s.fSign = Tmp.s64.fSign;
5343 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5344 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5345 | Tmp.s64.uFractionLo >> (64 - 15);
5346
5347 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5348 unsigned const cShiftOff = 64 - 15;
5349 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5350 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5351 if (uRoundedOff)
5352 {
5353 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5354 ? RT_BIT_64(cShiftOff - 1)
5355 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5356 ? fRoundingOffMask
5357 : 0;
5358 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5359 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5360 || uRoundedOff != uRoundingAdd)
5361 {
5362 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5363 {
5364 uFraction += 1;
5365 if (!(uFraction & RT_BIT_64(63)))
5366 { /* likely */ }
5367 else
5368 {
5369 uFraction >>= 1;
5370 pr80Dst->s.uExponent++;
5371 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5372 return fFsw;
5373 }
5374 fFsw |= X86_FSW_C1;
5375 }
5376 }
5377 fFsw |= X86_FSW_PE;
5378 if (!(fFcw & X86_FCW_PM))
5379 fFsw |= X86_FSW_ES | X86_FSW_B;
5380 }
5381
5382 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5383 }
5384 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5385 {
5386 pr80Dst->s.fSign = Tmp.s64.fSign;
5387 pr80Dst->s.uExponent = 0;
5388 pr80Dst->s.uMantissa = 0;
5389 }
5390 else if (RTFLOAT128U_IS_INF(&Tmp))
5391 {
5392 pr80Dst->s.fSign = Tmp.s64.fSign;
5393 pr80Dst->s.uExponent = 0x7fff;
5394 pr80Dst->s.uMantissa = 0;
5395 }
5396 return fFsw;
5397}
5398
5399
5400/**
5401 * Helper for transfering exception and C1 to FSW and setting the result value
5402 * accordingly.
5403 *
5404 * @returns Updated FSW.
5405 * @param pSoftState The SoftFloat state following the operation.
5406 * @param r80XResult The result of the SoftFloat operation.
5407 * @param pr80Result Where to store the result for IEM.
5408 * @param fFcw The FPU control word.
5409 * @param fFsw The FSW before the operation, with necessary bits
5410 * cleared and such.
5411 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5412 * raised.
5413 */
5414DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5415 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5416 PCRTFLOAT80U pr80XcptResult)
5417{
5418 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5419 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5420 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5421 fFsw |= X86_FSW_ES | X86_FSW_B;
5422
5423 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5424 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5425 else
5426 {
5427 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5428 *pr80Result = *pr80XcptResult;
5429 }
5430 return fFsw;
5431}
5432
5433
5434/**
5435 * Helper doing polynomial evaluation using Horner's method.
5436 *
5437 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5438 */
5439float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5440 unsigned cPrecision, softfloat_state_t *pSoftState)
5441{
5442 Assert(cHornerConsts > 1);
5443 size_t i = cHornerConsts - 1;
5444 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5445 while (i-- > 0)
5446 {
5447 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5448 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5449 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5450 }
5451 return r128Result;
5452}
5453
5454#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5455
5456
/**
 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
 * mantissa, exponent and sign.
 *
 * The top 64 bits of the 192-bit mantissa (qw2) become the result mantissa;
 * the lower 128 bits (qw1 and qw0) are what gets rounded off.
 *
 * @returns Updated FSW.
 * @param   pr80Dst         Where to return the composed value.
 * @param   fSign           The sign.
 * @param   puMantissa      The mantissa, 256-bit type but the top 64 bits are
 *                          ignored and should be zero.  This will probably be
 *                          modified during normalization and rounding.
 * @param   iExponent       Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            The FPU status word.
 */
static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
                                                    int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
{
    /* Strict builds assert that the top quad word is zero; it is zeroed if the check fails. */
    AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);

    /* From here on iExponent is the biased exponent. */
    iExponent += RTFLOAT80U_EXP_BIAS;

    /* Do normalization if necessary and possible. */
    if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
    {
        /* Number of positions the mantissa must move left to put its top set bit at bit 191. */
        int cShift = 192 - RTUInt256BitCount(puMantissa);
        if (iExponent > cShift)
            iExponent -= cShift;
        else
        {
            /* The exponent cannot absorb the whole shift.  With underflow masked,
               shift only by (iExponent - 1) positions, or not at all when the
               exponent is already zero or negative.  With underflow unmasked the
               full shift is applied and the exponent drops to zero or below. */
            if (fFcw & X86_FCW_UM)
            {
                if (iExponent > 0)
                    cShift = --iExponent;
                else
                    cShift = 0;
            }
            iExponent -= cShift;
        }
        RTUInt256AssignShiftLeft(puMantissa, cShift);
    }

    /* Do rounding. */
    uint64_t uMantissa = puMantissa->QWords.qw2;
    if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
    {
        bool fAdd;
        switch (fFcw & X86_FCW_RC_MASK)
        {
            default: /* (for the simple-minded MSC which otherwise thinks fAdd would be used uninitialized) */
            case X86_FCW_RC_NEAREST:
                /* Bit 63 of qw1 is the first rounded-off bit, i.e. at least half an ULP is being dropped. */
                if (puMantissa->QWords.qw1 & RT_BIT_64(63))
                {
                    /* Round up when above the midpoint, or exactly at the midpoint with an odd mantissa. */
                    if (   (uMantissa & 1)
                        || puMantissa->QWords.qw0 != 0
                        || puMantissa->QWords.qw1 != RT_BIT_64(63))
                    {
                        fAdd = true;
                        break;
                    }
                    /* Exactly midway with an even mantissa: stay on the even value. */
                    uMantissa &= ~(uint64_t)1;
                }
                fAdd = false;
                break;
            case X86_FCW_RC_ZERO:
                fAdd = false;
                break;
            case X86_FCW_RC_UP:
                fAdd = !fSign;
                break;
            case X86_FCW_RC_DOWN:
                fAdd = fSign;
                break;
        }
        if (fAdd)
        {
            uint64_t const uTmp = uMantissa;
            uMantissa = uTmp + 1;
            if (uMantissa < uTmp)
            {
                /* The increment wrapped the 64-bit mantissa: set the top bit again and bump the exponent. */
                uMantissa >>= 1;
                uMantissa |= RT_BIT_64(63);
                iExponent++;
            }
            fFsw |= X86_FSW_C1;
        }
        /* Bits were dropped, so the result is inexact. */
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    /* Check for underflow (denormals). */
    if (iExponent <= 0)
    {
        if (fFcw & X86_FCW_UM)
        {
            /* Masked: the result gets a zero exponent; a set top mantissa bit is shifted out of bit 63 first. */
            if (uMantissa & RT_BIT_64(63))
                uMantissa >>= 1;
            iExponent = 0;
        }
        else
        {
            /* Unmasked: the exponent is adjusted by RTFLOAT80U_EXP_BIAS_ADJUST and the exception is signalled. */
            iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_UE;
    }
    /* Check for overflow */
    else if (iExponent >= RTFLOAT80U_EXP_MAX)
    {
        /* NOTE(review): overflow is not handled here; this assertion necessarily
           fails whenever this branch is taken, and the out-of-range exponent is
           then stored as-is below - confirm callers cannot get here. */
        Assert(iExponent < RTFLOAT80U_EXP_MAX);
    }

    /* Compose the result. */
    pr80Dst->s.uMantissa = uMantissa;
    pr80Dst->s.uExponent = iExponent;
    pr80Dst->s.fSign     = fSign;
    return fFsw;
}
5575
5576
5577/**
5578 * See also iemAImpl_fld_r80_from_r32
5579 */
5580static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5581{
5582 uint16_t fFsw = 0;
5583 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5584 {
5585 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5586 pr80Dst->sj64.fInteger = 1;
5587 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5588 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5589 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5590 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5591 }
5592 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5593 {
5594 pr80Dst->s.fSign = pr32Val->s.fSign;
5595 pr80Dst->s.uExponent = 0;
5596 pr80Dst->s.uMantissa = 0;
5597 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5598 }
5599 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5600 {
5601 /* Subnormal -> normalized + X86_FSW_DE return. */
5602 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5603 pr80Dst->sj64.fInteger = 1;
5604 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5605 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5606 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5607 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5608 fFsw = X86_FSW_DE;
5609 }
5610 else if (RTFLOAT32U_IS_INF(pr32Val))
5611 {
5612 pr80Dst->s.fSign = pr32Val->s.fSign;
5613 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5614 pr80Dst->s.uMantissa = RT_BIT_64(63);
5615 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5616 }
5617 else
5618 {
5619 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5620 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5621 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5622 pr80Dst->sj64.fInteger = 1;
5623 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5624 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5625 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5626 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5627 }
5628 return fFsw;
5629}
5630
5631
5632/**
5633 * See also iemAImpl_fld_r80_from_r64
5634 */
5635static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5636{
5637 uint16_t fFsw = 0;
5638 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5639 {
5640 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5641 pr80Dst->sj64.fInteger = 1;
5642 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5643 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5644 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5645 }
5646 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5647 {
5648 pr80Dst->s.fSign = pr64Val->s.fSign;
5649 pr80Dst->s.uExponent = 0;
5650 pr80Dst->s.uMantissa = 0;
5651 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5652 }
5653 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5654 {
5655 /* Subnormal values gets normalized. */
5656 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5657 pr80Dst->sj64.fInteger = 1;
5658 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5659 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5660 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5661 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5662 fFsw = X86_FSW_DE;
5663 }
5664 else if (RTFLOAT64U_IS_INF(pr64Val))
5665 {
5666 pr80Dst->s.fSign = pr64Val->s.fSign;
5667 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5668 pr80Dst->s.uMantissa = RT_BIT_64(63);
5669 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5670 }
5671 else
5672 {
5673 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5674 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5675 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5676 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5677 pr80Dst->sj64.fInteger = 1;
5678 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5679 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5680 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5681 }
5682 return fFsw;
5683}
5684
5685
5686/**
5687 * See also EMIT_FILD.
5688 */
5689#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5690static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5691{ \
5692 if (iVal == 0) \
5693 { \
5694 pr80Dst->s.fSign = 0; \
5695 pr80Dst->s.uExponent = 0; \
5696 pr80Dst->s.uMantissa = 0; \
5697 } \
5698 else \
5699 { \
5700 if (iVal > 0) \
5701 pr80Dst->s.fSign = 0; \
5702 else \
5703 { \
5704 pr80Dst->s.fSign = 1; \
5705 iVal = -iVal; \
5706 } \
5707 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5708 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5709 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5710 } \
5711 return pr80Dst; \
5712}
5713EMIT_CONVERT_IXX_TO_R80(16)
5714EMIT_CONVERT_IXX_TO_R80(32)
5715//EMIT_CONVERT_IXX_TO_R80(64)
5716
/**
 * Emits an r80-by-r64 instruction helper (iemAImpl_fmul_r80_by_r64 and such)
 * on top of the corresponding r80-by-r80 implementation.
 *
 * The 64-bit operand is first widened to 80 bits.  If that conversion reports
 * a denormal operand, the report is dropped when the first operand is a 387
 * invalid value or a NaN or when @a a_DenormalException evaluates to true;
 * otherwise, with \#DE unmasked, the first operand is returned unchanged with
 * DE, ES and B set and the operation is not performed.  In all other cases the
 * r80-by-r80 worker runs and TOP is forced to 7 in the resulting FSW.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The r80-by-r80 implementation to call.
 * @param   a_DenormalException     Expression (may use pr80Val1) that, when
 *                                  true, suppresses the denormal report.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw) \
    { \
        if (   !RTFLOAT80U_IS_387_INVALID(pr80Val1) \
            && !RTFLOAT80U_IS_NAN(pr80Val1) \
            && !(a_DenormalException)) \
        { \
            if (!(pFpuState->FCW & X86_FCW_DM)) \
            { \
                /* Unmasked #DE: return the first operand untouched and raise the exception. */ \
                pFpuRes->r80Result = *pr80Val1; \
                pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                             | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
                return; \
            } \
            /* Masked #DE: keep DE in fFsw so it gets merged into the result below. */ \
        } \
        else \
            fFsw = 0; \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5739
/**
 * Emits an r80-by-r32 instruction helper (iemAImpl_fmul_r80_by_r32 and such)
 * on top of the corresponding r80-by-r80 implementation.
 *
 * The 32-bit operand is first widened to 80 bits.  If that conversion reports
 * a denormal operand, the report is dropped when the first operand is a 387
 * invalid value or a NaN or when @a a_DenormalException evaluates to true;
 * otherwise, with \#DE unmasked, the first operand is returned unchanged with
 * DE, ES and B set and the operation is not performed.  In all other cases the
 * r80-by-r80 worker runs and TOP is forced to 7 in the resulting FSW.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The r80-by-r80 implementation to call.
 * @param   a_DenormalException     Expression (may use pr80Val1) that, when
 *                                  true, suppresses the denormal report.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw) \
    { \
        if (   !RTFLOAT80U_IS_387_INVALID(pr80Val1) \
            && !RTFLOAT80U_IS_NAN(pr80Val1) \
            && !(a_DenormalException)) \
        { \
            if (!(pFpuState->FCW & X86_FCW_DM)) \
            { \
                /* Unmasked #DE: return the first operand untouched and raise the exception. */ \
                pFpuRes->r80Result = *pr80Val1; \
                pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                             | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
                return; \
            } \
            /* Masked #DE: keep DE in fFsw so it gets merged into the result below. */ \
        } \
        else \
            fFsw = 0; \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5762
/**
 * Emits an r80-by-i32 instruction helper (iemAImpl_fimul_r80_by_i32 and such):
 * the 32-bit integer operand is converted to 80-bit floating point, the
 * r80-by-r80 implementation is invoked, and TOP is forced to 7 in the
 * resulting FSW.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The r80-by-r80 implementation to call.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5771
/**
 * Emits an r80-by-i16 instruction helper (iemAImpl_fimul_r80_by_i16 and such):
 * the 16-bit integer operand is converted to 80-bit floating point, the
 * r80-by-r80 implementation is invoked, and TOP is forced to 7 in the
 * resulting FSW.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The r80-by-r80 implementation to call.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5780
5781
5782
5783/*********************************************************************************************************************************
5784* x86 FPU Division Operations *
5785*********************************************************************************************************************************/
5786
5787/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5788static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5789 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5790{
5791 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5792 {
5793 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5794 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5795 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5796 }
5797 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5798 { /* Div by zero. */
5799 if (fFcw & X86_FCW_ZM)
5800 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5801 else
5802 {
5803 *pr80Result = *pr80Val1Org;
5804 fFsw |= X86_FSW_ES | X86_FSW_B;
5805 }
5806 fFsw |= X86_FSW_ZE;
5807 }
5808 else
5809 { /* Invalid operand */
5810 if (fFcw & X86_FCW_IM)
5811 *pr80Result = g_r80Indefinite;
5812 else
5813 {
5814 *pr80Result = *pr80Val1Org;
5815 fFsw |= X86_FSW_ES | X86_FSW_B;
5816 }
5817 fFsw |= X86_FSW_IE;
5818 }
5819 return fFsw;
5820}
5821
5822
5823IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5824 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5825{
5826 uint16_t const fFcw = pFpuState->FCW;
5827 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5828
5829 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5830 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5831 {
5832 if (fFcw & X86_FCW_IM)
5833 pFpuRes->r80Result = g_r80Indefinite;
5834 else
5835 {
5836 pFpuRes->r80Result = *pr80Val1;
5837 fFsw |= X86_FSW_ES | X86_FSW_B;
5838 }
5839 fFsw |= X86_FSW_IE;
5840 }
5841 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5842 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5843 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5844 {
5845 if (fFcw & X86_FCW_DM)
5846 {
5847 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5848 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5849 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5850 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5851 }
5852 else
5853 {
5854 pFpuRes->r80Result = *pr80Val1;
5855 fFsw |= X86_FSW_ES | X86_FSW_B;
5856 }
5857 fFsw |= X86_FSW_DE;
5858 }
5859 /* SoftFloat can handle the rest: */
5860 else
5861 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5862
5863 pFpuRes->FSW = fFsw;
5864}
5865
5866
/* The r64, r32, i32 and i16 second-operand variants of the division, all
   built on iemAImpl_fdiv_r80_by_r80.  The r64/r32 variants pass 0 as the
   denormal exception expression, i.e. no extra suppression of the denormal
   report from the operand conversion. */
EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64,  iemAImpl_fdiv_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32,  iemAImpl_fdiv_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5871
5872
5873IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5874 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5875{
5876 uint16_t const fFcw = pFpuState->FCW;
5877 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5878
5879 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5880 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5881 {
5882 if (fFcw & X86_FCW_IM)
5883 pFpuRes->r80Result = g_r80Indefinite;
5884 else
5885 {
5886 pFpuRes->r80Result = *pr80Val1;
5887 fFsw |= X86_FSW_ES | X86_FSW_B;
5888 }
5889 fFsw |= X86_FSW_IE;
5890 }
5891 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5892 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5893 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5894 {
5895 if (fFcw & X86_FCW_DM)
5896 {
5897 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5898 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5899 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5900 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5901 }
5902 else
5903 {
5904 pFpuRes->r80Result = *pr80Val1;
5905 fFsw |= X86_FSW_ES | X86_FSW_B;
5906 }
5907 fFsw |= X86_FSW_DE;
5908 }
5909 /* SoftFloat can handle the rest: */
5910 else
5911 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5912
5913 pFpuRes->FSW = fFsw;
5914}
5915
5916
/* The r64, r32, i32 and i16 second-operand variants of the reversed division,
   all built on iemAImpl_fdivr_r80_by_r80.  For the r64/r32 variants the
   denormal report from the operand conversion is dropped when pr80Val1 (the
   divisor here) is zero. */
EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64,  iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32,  iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5921
5922
5923/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5924static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5925 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5926{
5927 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5928 {
5929 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5930 uint16_t fCxFlags = 0;
5931 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5932 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5933 &fCxFlags, &SoftState);
5934 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5935 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5936 if ( !(fFsw & X86_FSW_IE)
5937 && !RTFLOAT80U_IS_NAN(pr80Result)
5938 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5939 {
5940 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5941 fFsw |= fCxFlags & X86_FSW_C_MASK;
5942 }
5943 return fFsw;
5944 }
5945
5946 /* Invalid operand */
5947 if (fFcw & X86_FCW_IM)
5948 *pr80Result = g_r80Indefinite;
5949 else
5950 {
5951 *pr80Result = *pr80Val1Org;
5952 fFsw |= X86_FSW_ES | X86_FSW_B;
5953 }
5954 return fFsw | X86_FSW_IE;
5955}
5956
5957
5958static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5959 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5960{
5961 uint16_t const fFcw = pFpuState->FCW;
5962 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5963
5964 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5965 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5966 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5967 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5968 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5969 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5970 {
5971 if (fFcw & X86_FCW_IM)
5972 pFpuRes->r80Result = g_r80Indefinite;
5973 else
5974 {
5975 pFpuRes->r80Result = *pr80Val1;
5976 fFsw |= X86_FSW_ES | X86_FSW_B;
5977 }
5978 fFsw |= X86_FSW_IE;
5979 }
5980 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5981 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5982 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5983 {
5984 if (fFcw & X86_FCW_DM)
5985 {
5986 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5987 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5988 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5989 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5990 pr80Val1Org, fLegacyInstr);
5991 }
5992 else
5993 {
5994 pFpuRes->r80Result = *pr80Val1;
5995 fFsw |= X86_FSW_ES | X86_FSW_B;
5996 }
5997 fFsw |= X86_FSW_DE;
5998 }
5999 /* SoftFloat can handle the rest: */
6000 else
6001 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
6002 pr80Val1, fLegacyInstr);
6003
6004 pFpuRes->FSW = fFsw;
6005}
6006
6007
6008IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6009 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6010{
6011 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
6012}
6013
6014
6015IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6016 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6017{
6018 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
6019}
6020
6021
6022/*********************************************************************************************************************************
6023* x87 FPU Multiplication Operations *
6024*********************************************************************************************************************************/
6025
6026/** Worker for iemAImpl_fmul_r80_by_r80. */
6027static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6028 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6029{
6030 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6031 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6032 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6033}
6034
6035
6036IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6037 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6038{
6039 uint16_t const fFcw = pFpuState->FCW;
6040 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6041
6042 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6043 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6044 {
6045 if (fFcw & X86_FCW_IM)
6046 pFpuRes->r80Result = g_r80Indefinite;
6047 else
6048 {
6049 pFpuRes->r80Result = *pr80Val1;
6050 fFsw |= X86_FSW_ES | X86_FSW_B;
6051 }
6052 fFsw |= X86_FSW_IE;
6053 }
6054 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6055 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6056 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6057 {
6058 if (fFcw & X86_FCW_DM)
6059 {
6060 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6061 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6062 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6063 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6064 }
6065 else
6066 {
6067 pFpuRes->r80Result = *pr80Val1;
6068 fFsw |= X86_FSW_ES | X86_FSW_B;
6069 }
6070 fFsw |= X86_FSW_DE;
6071 }
6072 /* SoftFloat can handle the rest: */
6073 else
6074 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6075
6076 pFpuRes->FSW = fFsw;
6077}
6078
6079
/* The r64, r32, i32 and i16 second-operand variants of the multiplication,
   all built on iemAImpl_fmul_r80_by_r80.  The r64/r32 variants pass 0 as the
   denormal exception expression. */
EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64,  iemAImpl_fmul_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32,  iemAImpl_fmul_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
6084
6085
6086/*********************************************************************************************************************************
6087* x87 FPU Addition *
6088*********************************************************************************************************************************/
6089
6090/** Worker for iemAImpl_fadd_r80_by_r80. */
6091static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6092 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6093{
6094 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6095 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6096 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6097}
6098
6099
6100IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6101 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6102{
6103 uint16_t const fFcw = pFpuState->FCW;
6104 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6105
6106 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6107 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6108 {
6109 if (fFcw & X86_FCW_IM)
6110 pFpuRes->r80Result = g_r80Indefinite;
6111 else
6112 {
6113 pFpuRes->r80Result = *pr80Val1;
6114 fFsw |= X86_FSW_ES | X86_FSW_B;
6115 }
6116 fFsw |= X86_FSW_IE;
6117 }
6118 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6119 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6120 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6121 {
6122 if (fFcw & X86_FCW_DM)
6123 {
6124 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6125 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6126 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6127 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6128 }
6129 else
6130 {
6131 pFpuRes->r80Result = *pr80Val1;
6132 fFsw |= X86_FSW_ES | X86_FSW_B;
6133 }
6134 fFsw |= X86_FSW_DE;
6135 }
6136 /* SoftFloat can handle the rest: */
6137 else
6138 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6139
6140 pFpuRes->FSW = fFsw;
6141}
6142
6143
/* The r64, r32, i32 and i16 second-operand variants of the addition, all
   built on iemAImpl_fadd_r80_by_r80.  The r64/r32 variants pass 0 as the
   denormal exception expression. */
EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64,  iemAImpl_fadd_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32,  iemAImpl_fadd_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6148
6149
6150/*********************************************************************************************************************************
6151* x87 FPU Subtraction *
6152*********************************************************************************************************************************/
6153
6154/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6155static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6156 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6157{
6158 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6159 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6160 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6161}
6162
6163
6164IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6165 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6166{
6167 uint16_t const fFcw = pFpuState->FCW;
6168 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6169
6170 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6171 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6172 {
6173 if (fFcw & X86_FCW_IM)
6174 pFpuRes->r80Result = g_r80Indefinite;
6175 else
6176 {
6177 pFpuRes->r80Result = *pr80Val1;
6178 fFsw |= X86_FSW_ES | X86_FSW_B;
6179 }
6180 fFsw |= X86_FSW_IE;
6181 }
6182 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6183 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6184 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6185 {
6186 if (fFcw & X86_FCW_DM)
6187 {
6188 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6189 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6190 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6191 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6192 }
6193 else
6194 {
6195 pFpuRes->r80Result = *pr80Val1;
6196 fFsw |= X86_FSW_ES | X86_FSW_B;
6197 }
6198 fFsw |= X86_FSW_DE;
6199 }
6200 /* SoftFloat can handle the rest: */
6201 else
6202 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6203
6204 pFpuRes->FSW = fFsw;
6205}
6206
6207
/*
 * Remaining FSUB/FISUB operand flavours, instantiated through the EMIT_R80_BY_*
 * macros (defined outside this part of the file).  Each invocation names the new
 * function first and iemAImpl_fsub_r80_by_r80 as the 80-bit implementation.
 * NOTE(review): presumably the generated functions convert the second operand to
 * 80-bit and forward to the named implementation - confirm against the macro
 * definitions, including the meaning of the trailing 0 argument.
 */
6208EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6209EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6210EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6211EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6212
6213
6214/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6215IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6216 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6217{
6218 uint16_t const fFcw = pFpuState->FCW;
6219 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6220
6221 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6222 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6223 {
6224 if (fFcw & X86_FCW_IM)
6225 pFpuRes->r80Result = g_r80Indefinite;
6226 else
6227 {
6228 pFpuRes->r80Result = *pr80Val1;
6229 fFsw |= X86_FSW_ES | X86_FSW_B;
6230 }
6231 fFsw |= X86_FSW_IE;
6232 }
6233 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6234 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6235 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6236 {
6237 if (fFcw & X86_FCW_DM)
6238 {
6239 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6240 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6241 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6242 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6243 }
6244 else
6245 {
6246 pFpuRes->r80Result = *pr80Val1;
6247 fFsw |= X86_FSW_ES | X86_FSW_B;
6248 }
6249 fFsw |= X86_FSW_DE;
6250 }
6251 /* SoftFloat can handle the rest: */
6252 else
6253 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6254
6255 pFpuRes->FSW = fFsw;
6256}
6257
6258
/*
 * Remaining FSUBR/FISUBR operand flavours, instantiated through the EMIT_R80_BY_*
 * macros (defined outside this part of the file).  Each invocation names the new
 * function first and iemAImpl_fsubr_r80_by_r80 as the 80-bit implementation.
 * NOTE(review): presumably the generated functions convert the second operand to
 * 80-bit and forward to the named implementation - confirm against the macro
 * definitions, including the meaning of the trailing 0 argument.
 */
6259EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6260EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6261EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6262EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6263
6264
6265/*********************************************************************************************************************************
6266* x87 FPU Trigonometric Operations *
6267*********************************************************************************************************************************/
6268static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6269{
6270 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6271 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6272 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6273 extFloat80_t v;
6274 (void)fFcw;
6275
6276 v = extF80_atan2(y, x, &SoftState);
6277
6278 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6279 return fFsw;
6280}
6281
6282IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6283 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6284{
6285 uint16_t const fFcw = pFpuState->FCW;
6286 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6287
6288 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6289 {
6290 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6291
6292 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6293 if (!(fFcw & X86_FCW_PM))
6294 fFsw |= X86_FSW_ES | X86_FSW_B;
6295 }
6296 else
6297 {
6298 fFsw |= X86_FSW_IE;
6299 if (!(fFcw & X86_FCW_IM))
6300 {
6301 pFpuRes->r80Result = *pr80Val2;
6302 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6303 }
6304 else
6305 {
6306 pFpuRes->r80Result = g_r80Indefinite;
6307 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6308 }
6309 }
6310
6311 pFpuRes->FSW = fFsw;
6312}
6313#endif /* IEM_WITHOUT_ASSEMBLY */
6314
/**
 * Intel-named variant of FPATAN.
 *
 * Forwards all arguments unchanged to iemAImpl_fpatan_r80_by_r80, so it
 * currently behaves exactly like the generic implementation.
 * NOTE(review): presumably selected for Intel-flavoured guest CPUs - confirm
 * against the caller.
 */
6315IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6316 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6317{
6318 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6319}
6320
/**
 * AMD-named variant of FPATAN.
 *
 * Forwards all arguments unchanged to iemAImpl_fpatan_r80_by_r80, so it
 * currently behaves exactly like the generic implementation.
 * NOTE(review): presumably selected for AMD-flavoured guest CPUs - confirm
 * against the caller.
 */
6321IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6322 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6323{
6324 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6325}
6326
6327
6328#if defined(IEM_WITHOUT_ASSEMBLY)
6329static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6330{
6331 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6332 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6333 extFloat80_t v;
6334 (void)fFcw;
6335
6336 v = extF80_tan(x, &SoftState);
6337
6338 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6339 return fFsw;
6340}
6341
6342IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6343{
6344 uint16_t const fFcw = pFpuState->FCW;
6345 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6346
6347 if (RTFLOAT80U_IS_ZERO(pr80Val))
6348 {
6349 pFpuResTwo->r80Result1 = *pr80Val;
6350 pFpuResTwo->r80Result2 = g_ar80One[0];
6351 }
6352 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6353 {
6354 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6355 {
6356 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6357 pFpuResTwo->r80Result1 = *pr80Val;
6358 }
6359 else
6360 {
6361 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6362 {
6363 pFpuResTwo->r80Result1 = *pr80Val;
6364 }
6365 else
6366 {
6367 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6368 }
6369
6370 pFpuResTwo->r80Result2 = g_ar80One[0];
6371
6372 fFsw |= X86_FSW_PE;
6373 if (!(fFcw & X86_FCW_PM))
6374 fFsw |= X86_FSW_ES | X86_FSW_B;
6375 }
6376 }
6377 else
6378 {
6379 fFsw |= X86_FSW_IE;
6380 if (!(fFcw & X86_FCW_IM))
6381 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6382 }
6383
6384 pFpuResTwo->FSW = fFsw;
6385}
6386#endif /* IEM_WITHOUT_ASSEMBLY */
6387
/**
 * AMD-named variant of FPTAN.
 *
 * Forwards all arguments unchanged to iemAImpl_fptan_r80_r80, so it currently
 * behaves exactly like the generic implementation.
 * NOTE(review): presumably selected for AMD-flavoured guest CPUs - confirm
 * against the caller.
 */
6388IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6389{
6390 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6391}
6392
/**
 * Intel-named variant of FPTAN.
 *
 * Forwards all arguments unchanged to iemAImpl_fptan_r80_r80, so it currently
 * behaves exactly like the generic implementation.
 * NOTE(review): presumably selected for Intel-flavoured guest CPUs - confirm
 * against the caller.
 */
6393IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6394{
6395 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6396}
6397
6398#ifdef IEM_WITHOUT_ASSEMBLY
6399
6400static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6401{
6402 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6403 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6404 extFloat80_t v;
6405 (void)fFcw;
6406
6407 v = extF80_sin(x, &SoftState);
6408
6409 iemFpuSoftF80ToIprt(pr80Result, v);
6410
6411 return fFsw;
6412}
6413
6414IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6415{
6416 uint16_t const fFcw = pFpuState->FCW;
6417 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6418
6419 if (RTFLOAT80U_IS_ZERO(pr80Val))
6420 {
6421 pFpuRes->r80Result = *pr80Val;
6422 }
6423 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6424 {
6425 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6426 {
6427 fFsw |= X86_FSW_C2;
6428 pFpuRes->r80Result = *pr80Val;
6429 }
6430 else
6431 {
6432 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6433 {
6434 pFpuRes->r80Result = *pr80Val;
6435 }
6436 else
6437 {
6438 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6439 }
6440 fFsw |= X86_FSW_PE;
6441 if (!(fFcw & X86_FCW_PM))
6442 fFsw |= X86_FSW_ES | X86_FSW_B;
6443 }
6444 }
6445 else if (RTFLOAT80U_IS_INF(pr80Val))
6446 {
6447 fFsw |= X86_FSW_IE;
6448 if (!(fFcw & X86_FCW_IM))
6449 {
6450 fFsw |= X86_FSW_ES | X86_FSW_B;
6451 pFpuRes->r80Result = *pr80Val;
6452 }
6453 else
6454 {
6455 pFpuRes->r80Result = g_r80Indefinite;
6456 }
6457 }
6458 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6459 {
6460 fFsw |= X86_FSW_DE;
6461
6462 if (fFcw & X86_FCW_DM)
6463 {
6464 if (fFcw & X86_FCW_UM)
6465 {
6466 pFpuRes->r80Result = *pr80Val;
6467 }
6468 else
6469 {
6470 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6471 uint64_t uMantissa = pr80Val->s.uMantissa;
6472 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6473
6474 uExponent = 64 - uExponent;
6475 uMantissa <<= uExponent;
6476 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6477
6478 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6479 pFpuRes->r80Result.s.uMantissa = uMantissa;
6480 pFpuRes->r80Result.s.uExponent = uExponent;
6481 }
6482
6483 fFsw |= X86_FSW_UE | X86_FSW_PE;
6484
6485 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6486 {
6487 /* All the exceptions are masked. */
6488 }
6489 else
6490 {
6491 fFsw |= X86_FSW_ES | X86_FSW_B;
6492 }
6493 }
6494 else
6495 {
6496 pFpuRes->r80Result = *pr80Val;
6497
6498 fFsw |= X86_FSW_ES | X86_FSW_B;
6499 }
6500 }
6501 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6502 {
6503 pFpuRes->r80Result = *pr80Val;
6504 fFsw |= X86_FSW_DE;
6505
6506 if (fFcw & X86_FCW_DM)
6507 {
6508 if (fFcw & X86_FCW_PM)
6509 {
6510 fFsw |= X86_FSW_PE;
6511 }
6512 else
6513 {
6514 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6515 }
6516
6517 pFpuRes->r80Result.sj64.uExponent = 1;
6518 }
6519 else
6520 {
6521 fFsw |= X86_FSW_ES | X86_FSW_B;
6522 }
6523 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6524 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6525 {
6526 pFpuRes->r80Result = *pr80Val;
6527 } else {
6528 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6529 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6530 && (fFcw & X86_FCW_IM))
6531 pFpuRes->r80Result = g_r80Indefinite;
6532 else
6533 {
6534 pFpuRes->r80Result = *pr80Val;
6535 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6536 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6537 }
6538
6539 fFsw |= X86_FSW_IE;
6540 if (!(fFcw & X86_FCW_IM))
6541 fFsw |= X86_FSW_ES | X86_FSW_B;
6542 }
6543
6544 pFpuRes->FSW = fFsw;
6545}
6546#endif /* IEM_WITHOUT_ASSEMBLY */
6547
/**
 * AMD-named variant of FSIN.
 *
 * Forwards all arguments unchanged to iemAImpl_fsin_r80, so it currently
 * behaves exactly like the generic implementation.
 * NOTE(review): presumably selected for AMD-flavoured guest CPUs - confirm
 * against the caller.
 */
6548IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6549{
6550 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6551}
6552
/**
 * Intel-named variant of FSIN.
 *
 * Forwards all arguments unchanged to iemAImpl_fsin_r80, so it currently
 * behaves exactly like the generic implementation.
 * NOTE(review): presumably selected for Intel-flavoured guest CPUs - confirm
 * against the caller.
 */
6553IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6554{
6555 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6556}
6557
6558#ifdef IEM_WITHOUT_ASSEMBLY
6559
6560static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6561{
6562 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6563 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6564 extFloat80_t v;
6565 (void)fFcw;
6566
6567 v = extF80_cos(x, &SoftState);
6568
6569 iemFpuSoftF80ToIprt(pr80Result, v);
6570
6571 return fFsw;
6572}
6573
6574IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6575{
6576 uint16_t const fFcw = pFpuState->FCW;
6577 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6578
6579 if (RTFLOAT80U_IS_ZERO(pr80Val))
6580 {
6581 pFpuRes->r80Result = g_ar80One[0];
6582 }
6583 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6584 {
6585 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6586 {
6587 fFsw |= X86_FSW_C2;
6588 pFpuRes->r80Result = *pr80Val;
6589 }
6590 else
6591 {
6592 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6593 {
6594 pFpuRes->r80Result = g_ar80One[0];
6595
6596 }
6597 else
6598 {
6599 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6600 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6601 }
6602 fFsw |= X86_FSW_PE;
6603 if (!(fFcw & X86_FCW_PM))
6604 fFsw |= X86_FSW_ES | X86_FSW_B;
6605 }
6606 }
6607 else if (RTFLOAT80U_IS_INF(pr80Val))
6608 {
6609 fFsw |= X86_FSW_IE;
6610 if (!(fFcw & X86_FCW_IM))
6611 {
6612 fFsw |= X86_FSW_ES | X86_FSW_B;
6613 pFpuRes->r80Result = *pr80Val;
6614 }
6615 else
6616 {
6617 pFpuRes->r80Result = g_r80Indefinite;
6618 }
6619 }
6620 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6621 {
6622 fFsw |= X86_FSW_DE;
6623
6624 if (fFcw & X86_FCW_DM)
6625 {
6626 pFpuRes->r80Result = g_ar80One[0];
6627
6628 if (fFcw & X86_FCW_PM)
6629 {
6630 fFsw |= X86_FSW_PE;
6631 }
6632 else
6633 {
6634 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6635 }
6636 }
6637 else
6638 {
6639 pFpuRes->r80Result = *pr80Val;
6640 fFsw |= X86_FSW_ES | X86_FSW_B;
6641 }
6642 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6643 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6644 {
6645 pFpuRes->r80Result = *pr80Val;
6646 } else {
6647 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6648 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6649 && (fFcw & X86_FCW_IM))
6650 pFpuRes->r80Result = g_r80Indefinite;
6651 else
6652 {
6653 pFpuRes->r80Result = *pr80Val;
6654 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6655 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6656 }
6657
6658 fFsw |= X86_FSW_IE;
6659 if (!(fFcw & X86_FCW_IM))
6660 fFsw |= X86_FSW_ES | X86_FSW_B;
6661 }
6662
6663 pFpuRes->FSW = fFsw;
6664}
6665#endif /* IEM_WITHOUT_ASSEMBLY */
6666
/**
 * AMD-named variant of FCOS.
 *
 * Forwards all arguments unchanged to iemAImpl_fcos_r80, so it currently
 * behaves exactly like the generic implementation.
 * NOTE(review): presumably selected for AMD-flavoured guest CPUs - confirm
 * against the caller.
 */
6667IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6668{
6669 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6670}
6671
/**
 * Intel-named variant of FCOS.
 *
 * Forwards all arguments unchanged to iemAImpl_fcos_r80, so it currently
 * behaves exactly like the generic implementation.
 * NOTE(review): presumably selected for Intel-flavoured guest CPUs - confirm
 * against the caller.
 */
6672IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6673{
6674 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6675}
6676
6677#ifdef IEM_WITHOUT_ASSEMBLY
6678
6679static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6680{
6681 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6682 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6683 extFloat80_t r80Sin, r80Cos;
6684 (void)fFcw;
6685
6686 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6687
6688 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6689 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6690
6691 return fFsw;
6692}
6693
6694IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6695{
6696 uint16_t const fFcw = pFpuState->FCW;
6697 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6698
6699 if (RTFLOAT80U_IS_ZERO(pr80Val))
6700 {
6701 pFpuResTwo->r80Result1 = *pr80Val;
6702 pFpuResTwo->r80Result2 = g_ar80One[0];
6703 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6704 }
6705 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6706 {
6707 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6708 {
6709 fFsw |= X86_FSW_C2;
6710
6711 if (fFcw & X86_FCW_IM)
6712 {
6713 pFpuResTwo->r80Result1 = g_r80Indefinite;
6714 }
6715 else
6716 {
6717 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6718 }
6719
6720 pFpuResTwo->r80Result2 = *pr80Val;
6721 }
6722 else
6723 {
6724 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6725
6726 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6727 {
6728 pFpuResTwo->r80Result1 = *pr80Val;
6729 pFpuResTwo->r80Result2 = g_ar80One[0];
6730 }
6731 else
6732 {
6733 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6734 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6735 }
6736 fFsw |= X86_FSW_PE;
6737 if (!(fFcw & X86_FCW_PM))
6738 fFsw |= X86_FSW_ES | X86_FSW_B;
6739 }
6740 }
6741 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6742 {
6743 fFsw |= X86_FSW_DE;
6744
6745 if (fFcw & X86_FCW_DM)
6746 {
6747 pFpuResTwo->r80Result1 = *pr80Val;
6748 pFpuResTwo->r80Result2 = g_ar80One[0];
6749 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6750
6751 if (fFcw & X86_FCW_PM)
6752 {
6753 fFsw |= X86_FSW_PE;
6754 }
6755 else
6756 {
6757 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6758 }
6759
6760 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6761 }
6762 else
6763 {
6764 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6765 pFpuResTwo->r80Result2 = *pr80Val;
6766 fFsw |= X86_FSW_ES | X86_FSW_B;
6767 }
6768 }
6769 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6770 {
6771 fFsw |= X86_FSW_DE;
6772
6773 if (fFcw & X86_FCW_DM)
6774 {
6775 pFpuResTwo->r80Result2 = g_ar80One[0];
6776
6777 if (fFcw & X86_FCW_UM)
6778 {
6779 pFpuResTwo->r80Result1 = *pr80Val;
6780 }
6781 else
6782 {
6783 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6784 uint64_t uMantissa = pr80Val->s.uMantissa;
6785 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6786
6787 uExponent = 64 - uExponent;
6788 uMantissa <<= uExponent;
6789 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6790
6791 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6792 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6793 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6794 }
6795
6796 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6797 fFsw |= X86_FSW_UE | X86_FSW_PE;
6798
6799 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6800 {
6801 /* All the exceptions are masked. */
6802 }
6803 else
6804 {
6805 fFsw |= X86_FSW_ES | X86_FSW_B;
6806 }
6807 }
6808 else
6809 {
6810 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6811 pFpuResTwo->r80Result2 = *pr80Val;
6812 fFsw |= X86_FSW_ES | X86_FSW_B;
6813 }
6814 }
6815 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6816 {
6817 pFpuResTwo->r80Result1 = *pr80Val;
6818 pFpuResTwo->r80Result2 = *pr80Val;
6819 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6820 }
6821 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6822 {
6823 if (fFcw & X86_FCW_IM)
6824 {
6825 pFpuResTwo->r80Result1 = g_r80Indefinite;
6826 pFpuResTwo->r80Result2 = g_r80Indefinite;
6827 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6828 }
6829 else
6830 {
6831 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6832 pFpuResTwo->r80Result2 = *pr80Val;
6833 }
6834
6835 fFsw |= X86_FSW_IE;
6836 if (!(fFcw & X86_FCW_IM))
6837 fFsw |= X86_FSW_ES | X86_FSW_B;
6838 }
6839 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6840 {
6841 pFpuResTwo->r80Result1 = *pr80Val;
6842 pFpuResTwo->r80Result2 = *pr80Val;
6843
6844 if (fFcw & X86_FCW_IM)
6845 {
6846 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6847 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6848 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6849 }
6850 else
6851 {
6852 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6853 pFpuResTwo->r80Result2 = *pr80Val;
6854 }
6855
6856 fFsw |= X86_FSW_IE;
6857 if (!(fFcw & X86_FCW_IM))
6858 fFsw |= X86_FSW_ES | X86_FSW_B;
6859 }
6860 else if (RTFLOAT80U_IS_INF(pr80Val))
6861 {
6862 if (fFcw & X86_FCW_IM)
6863 {
6864 pFpuResTwo->r80Result1 = g_r80Indefinite;
6865 pFpuResTwo->r80Result2 = g_r80Indefinite;
6866 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6867 }
6868 else
6869 {
6870 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6871 pFpuResTwo->r80Result2 = *pr80Val;
6872 }
6873
6874 fFsw |= X86_FSW_IE;
6875 if (!(fFcw & X86_FCW_IM))
6876 fFsw |= X86_FSW_ES | X86_FSW_B;
6877 }
6878
6879 pFpuResTwo->FSW = fFsw;
6880}
6881#endif /* IEM_WITHOUT_ASSEMBLY */
6882
/**
 * AMD-named variant of FSINCOS.
 *
 * Forwards all arguments unchanged to iemAImpl_fsincos_r80_r80, so it
 * currently behaves exactly like the generic implementation.
 * NOTE(review): presumably selected for AMD-flavoured guest CPUs - confirm
 * against the caller.
 */
6883IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6884{
6885 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6886}
6887
/**
 * Intel-named variant of FSINCOS.
 *
 * Forwards all arguments unchanged to iemAImpl_fsincos_r80_r80, so it
 * currently behaves exactly like the generic implementation.
 * NOTE(review): presumably selected for Intel-flavoured guest CPUs - confirm
 * against the caller.
 */
6888IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6889{
6890 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6891}
6892
6893#ifdef IEM_WITHOUT_ASSEMBLY
6894
6895
6896/*********************************************************************************************************************************
6897* x87 FPU Compare and Testing Operations *
6898*********************************************************************************************************************************/
6899
6900IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6901{
6902 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6903
6904 if (RTFLOAT80U_IS_ZERO(pr80Val))
6905 fFsw |= X86_FSW_C3;
6906 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6907 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6908 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6909 {
6910 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6911 if (!(pFpuState->FCW & X86_FCW_DM))
6912 fFsw |= X86_FSW_ES | X86_FSW_B;
6913 }
6914 else
6915 {
6916 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6917 if (!(pFpuState->FCW & X86_FCW_IM))
6918 fFsw |= X86_FSW_ES | X86_FSW_B;
6919 }
6920
6921 *pu16Fsw = fFsw;
6922}
6923
6924
6925IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6926{
6927 RT_NOREF(pFpuState);
6928 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6929
6930 /* C1 = sign bit (always, even if empty Intel says). */
6931 if (pr80Val->s.fSign)
6932 fFsw |= X86_FSW_C1;
6933
6934 /* Classify the value in C0, C2, C3. */
6935 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6936 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6937 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6938 fFsw |= X86_FSW_C2;
6939 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6940 fFsw |= X86_FSW_C3;
6941 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6942 fFsw |= X86_FSW_C0;
6943 else if (RTFLOAT80U_IS_INF(pr80Val))
6944 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6945 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6946 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6947 /* whatever else: 0 */
6948
6949 *pu16Fsw = fFsw;
6950}
6951
6952
6953/**
6954 * Worker for fcom, fucom, and friends.
6955 */
6956static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6957 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6958{
6959 /*
6960 * Unpack the values.
6961 */
6962 bool const fSign1 = pr80Val1->s.fSign;
6963 int32_t iExponent1 = pr80Val1->s.uExponent;
6964 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6965
6966 bool const fSign2 = pr80Val2->s.fSign;
6967 int32_t iExponent2 = pr80Val2->s.uExponent;
6968 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6969
6970 /*
6971 * Check for invalid inputs.
6972 */
6973 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6974 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6975 {
6976 if (!(fFcw & X86_FCW_IM))
6977 fFsw |= X86_FSW_ES | X86_FSW_B;
6978 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6979 }
6980
6981 /*
6982 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6983 */
6984 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6985 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6986 {
6987 if ( fIeOnAllNaNs
6988 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6989 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6990 {
6991 fFsw |= X86_FSW_IE;
6992 if (!(fFcw & X86_FCW_IM))
6993 fFsw |= X86_FSW_ES | X86_FSW_B;
6994 }
6995 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6996 }
6997
6998 /*
6999 * Normalize the values.
7000 */
7001 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7002 {
7003 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7004 iExponent1 = 1;
7005 else
7006 {
7007 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
7008 uMantissa1 <<= iExponent1;
7009 iExponent1 = 1 - iExponent1;
7010 }
7011 fFsw |= X86_FSW_DE;
7012 if (!(fFcw & X86_FCW_DM))
7013 fFsw |= X86_FSW_ES | X86_FSW_B;
7014 }
7015
7016 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7017 {
7018 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7019 iExponent2 = 1;
7020 else
7021 {
7022 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
7023 uMantissa2 <<= iExponent2;
7024 iExponent2 = 1 - iExponent2;
7025 }
7026 fFsw |= X86_FSW_DE;
7027 if (!(fFcw & X86_FCW_DM))
7028 fFsw |= X86_FSW_ES | X86_FSW_B;
7029 }
7030
7031 /*
7032 * Test if equal (val1 == val2):
7033 */
7034 if ( uMantissa1 == uMantissa2
7035 && iExponent1 == iExponent2
7036 && ( fSign1 == fSign2
7037 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
7038 fFsw |= X86_FSW_C3;
7039 /*
7040 * Test if less than (val1 < val2):
7041 */
7042 else if (fSign1 && !fSign2)
7043 fFsw |= X86_FSW_C0;
7044 else if (fSign1 == fSign2)
7045 {
7046 /* Zeros are problematic, however at the most one can be zero here. */
7047 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
7048 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7049 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
7050 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7051
7052 if ( fSign1
7053 ^ ( iExponent1 < iExponent2
7054 || ( iExponent1 == iExponent2
7055 && uMantissa1 < uMantissa2 ) ) )
7056 fFsw |= X86_FSW_C0;
7057 }
7058 /* else: No flags set if greater. */
7059
7060 return fFsw;
7061}
7062
7063
7064IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7065 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7066{
7067 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7068}
7069
7070
7071
7072
7073IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7074 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7075{
7076 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
7077}
7078
7079
7080IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7081 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
7082{
7083 RTFLOAT80U r80Val2;
7084 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
7085 Assert(!fFsw || fFsw == X86_FSW_DE);
7086 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7087 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7088 {
7089 if (!(pFpuState->FCW & X86_FCW_DM))
7090 fFsw |= X86_FSW_ES | X86_FSW_B;
7091 *pfFsw |= fFsw;
7092 }
7093}
7094
7095
7096IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7097 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
7098{
7099 RTFLOAT80U r80Val2;
7100 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
7101 Assert(!fFsw || fFsw == X86_FSW_DE);
7102 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7103 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7104 {
7105 if (!(pFpuState->FCW & X86_FCW_DM))
7106 fFsw |= X86_FSW_ES | X86_FSW_B;
7107 *pfFsw |= fFsw;
7108 }
7109}
7110
7111
7112IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7113 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
7114{
7115 RTFLOAT80U r80Val2;
7116 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
7117 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7118}
7119
7120
7121IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7122 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
7123{
7124 RTFLOAT80U r80Val2;
7125 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
7126 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7127}
7128
7129
7130/**
7131 * Worker for fcomi & fucomi.
7132 */
7133static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7134 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
7135{
7136 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
7137 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
7138 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
7139 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
7140
7141 /* Note! C1 is not cleared as per docs! Everything is preserved. */
7142 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
7143 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
7144}
7145
7146
7147IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7148 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7149{
7150 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
7151}
7152
7153
7154IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7155 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7156{
7157 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
7158}
7159
7160
7161/*********************************************************************************************************************************
7162* x87 FPU Other Operations *
7163*********************************************************************************************************************************/
7164
7165/**
7166 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7167 */
7168static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7169{
7170 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7171 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7172 true /*exact / generate #PE */, &SoftState));
7173 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7174}
7175
7176
/**
 * Rounds an 80-bit floating point value to an integral value (portable C
 * variant).
 *
 * The returned status word keeps C0, C2 and C3 from the incoming FSW and has
 * its TOP field set to 7; exception bits are ORed in per input class below.
 *
 * @param   pFpuState   The FPU state; FCW and FSW are read.
 * @param   pFpuRes     Where to return the result value and the status word.
 * @param   pr80Val     The value to round.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t       fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);

    /* Normal values: do the actual rounding. */
    if (RTFLOAT80U_IS_NORMAL(pr80Val))
        fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
    /* Zero, quiet NaN, indefinite and infinity are returned unchanged without raising anything. */
    else if (   RTFLOAT80U_IS_ZERO(pr80Val)
             || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
             || RTFLOAT80U_IS_INDEFINITE(pr80Val)
             || RTFLOAT80U_IS_INF(pr80Val))
        pFpuRes->r80Result = *pr80Val;
    /* Denormals: raise #DE; only round when that exception is masked. */
    else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
    {
        fFsw |= X86_FSW_DE;
        if (fFcw & X86_FCW_DM)
            fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
        else
        {
            /* Unmasked: hand back the input and flag the pending exception. */
            pFpuRes->r80Result = *pr80Val;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    }
    /* Everything else is treated as an invalid operand (#IE). */
    else
    {
        if (fFcw & X86_FCW_IM)
        {
            /* Masked: signalling NaNs are quieted, anything else becomes the indefinite value. */
            if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
                pFpuRes->r80Result = g_r80Indefinite;
            else
            {
                pFpuRes->r80Result = *pr80Val;
                pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
            }
        }
        else
        {
            /* Unmasked: hand back the input and flag the pending exception. */
            pFpuRes->r80Result = *pr80Val;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    pFpuRes->FSW = fFsw;
}
7221
7222
7223IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7224 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7225{
7226 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7227 it does everything we need it to do. */
7228 uint16_t const fFcw = pFpuState->FCW;
7229 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7230 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7231 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7232 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7233}
7234
7235
7236/**
7237 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7238 */
7239static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7240{
7241 Assert(!pr80Val->s.fSign);
7242 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7243 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7244 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7245}
7246
7247
/**
 * Calculates the square root of an 80-bit floating point value (portable C
 * variant).
 *
 * The returned status word keeps C0, C2 and C3 from the incoming FSW and has
 * its TOP field set to 7; exception bits are ORed in per input class below.
 *
 * @param   pFpuState   The FPU state; FCW and FSW are read.
 * @param   pFpuRes     Where to return the result value and the status word.
 * @param   pr80Val     The input value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t       fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);

    /* Positive normal values: do the actual calculation. */
    if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
        fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
    /* Zero (either sign), quiet NaN, indefinite and positive infinity are returned unchanged. */
    else if (   RTFLOAT80U_IS_ZERO(pr80Val)
             || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
             || RTFLOAT80U_IS_INDEFINITE(pr80Val)
             || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
        pFpuRes->r80Result = *pr80Val;
    /* Positive denormals: raise #DE; only calculate when that exception is masked. */
    else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
    {
        fFsw |= X86_FSW_DE;
        if (fFcw & X86_FCW_DM)
            fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
        else
        {
            /* Unmasked: hand back the input and flag the pending exception. */
            pFpuRes->r80Result = *pr80Val;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    }
    /* Everything else, including all remaining negative inputs, is an invalid operand (#IE). */
    else
    {
        if (fFcw & X86_FCW_IM)
        {
            /* Masked: signalling NaNs are quieted, anything else becomes the indefinite value. */
            if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
                pFpuRes->r80Result = g_r80Indefinite;
            else
            {
                pFpuRes->r80Result = *pr80Val;
                pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
            }
        }
        else
        {
            /* Unmasked: hand back the input and flag the pending exception. */
            pFpuRes->r80Result = *pr80Val;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_IE;
    }
    pFpuRes->FSW = fFsw;
}
7292
7293
7294/**
7295 * @code{.unparsed}
 *              x        x * ln2
 *      f(x) = 2  - 1 = e        - 1
7298 *
7299 * @endcode
7300 *
7301 * We can approximate e^x by a Taylor/Maclaurin series (see
7302 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7303 * @code{.unparsed}
 *             n      0     1     2     3     4
 *      inf   x      x     x     x     x     x
 *      SUM ----- = --- + --- + --- + --- + --- + ...
 *      n=0   n!     0!    1!    2!    3!    4!
 *
 *                            2     3     4
 *                           x     x     x
 *                = 1 + x + --- + --- + --- + ...
 *                           2!    3!    4!
7313 * @endcode
7314 *
7315 * Given z = x * ln2, we get:
7316 * @code{.unparsed}
 *                     2     3     4           n
 *       z            z     z     z           z
 *      e  - 1 = z + --- + --- + --- + ... + ---
 *                    2!    3!    4!          n!
7321 * @endcode
7322 *
7323 * Wanting to use Horner's method, we move one z outside and get:
7324 * @code{.unparsed}
 *                        2     3            (n-1)
 *                 z     z     z            z
 *      = z ( 1 + --- + --- + --- + ... + ------- )
 *                 2!    3!    4!            n!
7329 * @endcode
7330 *
7331 * The constants we need for using Horner's methods are 1 and 1 / n!.
7332 *
 * For very tiny x values, we can get away with f(x) = x * ln 2, because we
 * don't have the necessary precision to represent 1.0 + z/2! + ... and can
 * approximate it to be 1.0.  For a visual demonstration of this check out
 * https://www.desmos.com/calculator/vidcdxizd9 (for as long as it is valid),
 * plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7338 *
7339 *
7340 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7341 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7342 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7343 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7344 * blocks). (The one bit difference is probably an implicit one missing from
7345 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7346 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7347 * exponent.
7348 *
 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have
 * not yet successfully reproduced the exact results from an Intel 10980XE;
 * there is always a portion of rounding differences.  Not going to spend too
 * much time on getting this 100% the same, at least not now.
 *
 * P.S. If someone is really curious about the 8087 and its constants:
 *      http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7356 *
7357 *
7358 * @param pr80Val The exponent value (x), less than 1.0, greater than
7359 * -1.0 and not zero. This can be a normal, denormal
7360 * or pseudo-denormal value.
7361 * @param pr80Result Where to return the result.
7362 * @param fFcw FPU control word.
7363 * @param fFsw FPU status word.
7364 */
static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
{
    /* As mentioned above, we can skip the expensive polynomial calculation
       as it will be close enough to 1.0 that it makes no difference.

       The cutoff point for intel 10980XE is exponents >= -69.  Intel
       also seems to be using a 67-bit or 68-bit constant value, and we get
       a smattering of rounding differences if we go for higher precision.

       Note: the check below takes the shortcut when the biased exponent is
       at most RTFLOAT80U_EXP_BIAS - 69, i.e. for unbiased exponents <= -69. */
    if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
    {
        /* Shortcut: result = x * ln2, done as a wide integer multiplication
           of the ln2 mantissa constant by the input mantissa. */
        RTUINT256U u256;
        RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
        u256.QWords.qw0 |= 1; /* force #PE */
        /* Denormals and pseudo-denormals are composed with the fixed
           exponent 1 - bias, everything else with its own unbiased exponent. */
        fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
                                                      !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
                                                   ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
                                                   : 1 - RTFLOAT80U_EXP_BIAS,
                                                   fFcw, fFsw);
    }
    else
    {
#ifdef IEM_WITH_FLOAT128_FOR_FPU
        /* Host _Float128 variant: calculate 2^x - 1 directly.
           This approach is not good enough for small values, we end up with zero. */
        int const fOldRounding = iemFpuF128SetRounding(fFcw);
        _Float128 rd128Val     = iemFpuF128FromFloat80(pr80Val, fFcw);
        _Float128 rd128Result  = powf128(2.0L, rd128Val);
        rd128Result -= 1.0L;
        fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
        iemFpuF128RestoreRounding(fOldRounding);

# else
        /* SoftFloat variant: evaluate the series described in the function
           documentation using 128-bit floats of reduced precision. */
        softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
        float128_t const  x         = iemFpuSoftF128FromFloat80(pr80Val);

        /* As mentioned above, enforce 68-bit internal mantissa width to better
           match the Intel 10980XE results. */
        unsigned const cPrecision = 68;

        /* first calculate z = x * ln2 */
        float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
                                               cPrecision);

        /* Then do the polynomial evaluation. */
        float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
                                                cPrecision, &SoftState);
        /* Multiply in the z that was moved outside the polynomial. */
        r = f128_mul(z, r, &SoftState);

        /* Output the result. */
        fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
# endif
    }
    return fFsw;
}
7418
7419
/**
 * Calculates 2^x - 1 for an 80-bit floating point value (portable C variant).
 *
 * The returned status word keeps C0, C2 and C3 from the incoming FSW and has
 * its TOP field set to 7; exception bits are ORed in per input class below.
 *
 * @param   pFpuState   The FPU state; FCW and FSW are read.
 * @param   pFpuRes     Where to return the result value and the status word.
 * @param   pr80Val     The exponent value (x).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t       fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);

    if (RTFLOAT80U_IS_NORMAL(pr80Val))
    {
        /* A biased exponent below the bias means |x| < 1.0: do the real calculation. */
        if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
            fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
        else
        {
            /* Special case:
                    2^+1.0 - 1.0 = 1.0
                    2^-1.0 - 1.0 = -0.5 */
            if (   pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
                && pr80Val->s.uMantissa == RT_BIT_64(63))
            {
                /* Subtracting the sign bit from the exponent halves the result for -1.0. */
                pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
                pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
                pFpuRes->r80Result.s.fSign     = pr80Val->s.fSign;
            }
            /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
            /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
            else
                pFpuRes->r80Result = *pr80Val;
            /* Both the special case and the out-of-range case raise #PE. */
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    }
    /* Zero, quiet NaN and indefinite are returned unchanged without raising anything. */
    else if (   RTFLOAT80U_IS_ZERO(pr80Val)
             || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
             || RTFLOAT80U_IS_INDEFINITE(pr80Val))
        pFpuRes->r80Result = *pr80Val;
    /* Infinity: negative input yields g_ar80One[1] (presumably -1.0 - confirm table layout),
       positive input is returned unchanged. */
    else if (RTFLOAT80U_IS_INF(pr80Val))
        pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
    /* Denormals: raise #DE; only calculate when that exception is masked. */
    else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
    {
        fFsw |= X86_FSW_DE;
        if (fFcw & X86_FCW_DM)
            fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
        else
        {
            pFpuRes->r80Result = *pr80Val;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
    }
    /* Everything else is an invalid operand (#IE). */
    else
    {
        /* Masked unnormals and pseudo-NaNs become the indefinite value... */
        if (   (   RTFLOAT80U_IS_UNNORMAL(pr80Val)
                || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
            && (fFcw & X86_FCW_IM))
            pFpuRes->r80Result = g_r80Indefinite;
        /* ...while the rest is passed through, with masked signalling NaNs quieted. */
        else
        {
            pFpuRes->r80Result = *pr80Val;
            if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
                pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
        }
        fFsw |= X86_FSW_IE;
        if (!(fFcw & X86_FCW_IM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }
    pFpuRes->FSW = fFsw;
}
7485
7486#endif /* IEM_WITHOUT_ASSEMBLY */
7487
/**
 * AMD flavoured f2xm1 entry point; simply forwards to iemAImpl_f2xm1_r80.
 *
 * Defined outside the IEM_WITHOUT_ASSEMBLY conditional, so it is compiled
 * regardless of that setting.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
}
7492
/**
 * Intel flavoured f2xm1 entry point; simply forwards to iemAImpl_f2xm1_r80.
 *
 * Defined outside the IEM_WITHOUT_ASSEMBLY conditional, so it is compiled
 * regardless of that setting.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
}
7497
7498#ifdef IEM_WITHOUT_ASSEMBLY
7499
7500IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7501{
7502 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7503 pFpuRes->r80Result = *pr80Val;
7504 pFpuRes->r80Result.s.fSign = 0;
7505}
7506
7507
7508IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7509{
7510 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7511 pFpuRes->r80Result = *pr80Val;
7512 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7513}
7514
7515
/**
 * Splits an 80-bit floating point value into its exponent and significand
 * (portable C variant).
 *
 * r80Result1 receives the unbiased exponent converted to a floating point
 * value, r80Result2 the input with its exponent replaced by the bias.  The
 * status word starts out with C0, C2 and C3 from the incoming FSW and TOP set
 * to 6; on the unmasked exception paths TOP is changed to 7 and r80Result1 is
 * left unwritten.
 *
 * @param   pFpuState   The FPU state; FCW and FSW are read.
 * @param   pFpuResTwo  Where to return the two result values and the status word.
 * @param   pr80Val     The value to split.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    uint16_t const fFcw = pFpuState->FCW;
    uint16_t       fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);

    if (RTFLOAT80U_IS_NORMAL(pr80Val))
    {
        /* Result 1: the unbiased exponent as a float. */
        softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
        iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));

        /* Result 2: same sign and mantissa, exponent set to the bias. */
        pFpuResTwo->r80Result2.s.fSign     = pr80Val->s.fSign;
        pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
        pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
    }
    else if (RTFLOAT80U_IS_ZERO(pr80Val))
    {
        /* Zero input raises #ZE. */
        fFsw |= X86_FSW_ZE;
        if (fFcw & X86_FCW_ZM)
        {
            /* Masked: exponent is g_ar80Infinity[1] (presumably negative infinity - confirm table layout). */
            pFpuResTwo->r80Result1 = g_ar80Infinity[1];
            pFpuResTwo->r80Result2 = *pr80Val;
        }
        else
        {
            /* Unmasked: only result 2 is written; TOP is switched to 7. */
            pFpuResTwo->r80Result2 = *pr80Val;
            fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
    {
        /* Denormal input raises #DE. */
        fFsw |= X86_FSW_DE;
        if (fFcw & X86_FCW_DM)
        {
            pFpuResTwo->r80Result2.s.fSign     = pr80Val->s.fSign;
            pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
            pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
            /* Normalize the mantissa, lowering the exponent once per shift.
               The start value is presumably 1 - RTFLOAT80U_EXP_BIAS. */
            int32_t iExponent = -16382;
            while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
            {
                pFpuResTwo->r80Result2.s.uMantissa <<= 1;
                iExponent--;
            }

            softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
            iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
        }
        else
        {
            /* Unmasked: only result 2 is written; TOP is switched to 7. */
            pFpuResTwo->r80Result2 = *pr80Val;
            fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    /* Quiet NaN and indefinite are copied to both results. */
    else if (   RTFLOAT80U_IS_QUIET_NAN(pr80Val)
             || RTFLOAT80U_IS_INDEFINITE(pr80Val))
    {
        pFpuResTwo->r80Result1 = *pr80Val;
        pFpuResTwo->r80Result2 = *pr80Val;
    }
    /* Infinity: exponent is g_ar80Infinity[0] (presumably positive infinity), significand is the input. */
    else if (RTFLOAT80U_IS_INF(pr80Val))
    {
        pFpuResTwo->r80Result1 = g_ar80Infinity[0];
        pFpuResTwo->r80Result2 = *pr80Val;
    }
    /* Everything else is an invalid operand (#IE). */
    else
    {
        if (fFcw & X86_FCW_IM)
        {
            /* Masked: signalling NaNs are quieted, anything else becomes the
               indefinite value; both results get the same value. */
            if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
                pFpuResTwo->r80Result1 = g_r80Indefinite;
            else
            {
                pFpuResTwo->r80Result1 = *pr80Val;
                pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
            }
            pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
        }
        else
        {
            /* Unmasked: only result 2 is written; TOP is switched to 7. */
            pFpuResTwo->r80Result2 = *pr80Val;
            fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
        }
        fFsw |= X86_FSW_IE;
    }
    pFpuResTwo->FSW = fFsw;
}
7601#endif /* IEM_WITHOUT_ASSEMBLY */
7602
7603#if defined(IEM_WITHOUT_ASSEMBLY)
7604
7605static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7606{
7607 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7608 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7609 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7610 extFloat80_t v;
7611 (void)fFcw;
7612
7613 v = extF80_ylog2x(y, x, &SoftState);
7614 iemFpuSoftF80ToIprt(pr80Result, v);
7615
7616 return fFsw;
7617}
7618
7619IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7620 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7621{
7622 uint16_t const fFcw = pFpuState->FCW;
7623 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7624
7625 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7626 {
7627 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7628
7629 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7630 if (!(fFcw & X86_FCW_PM))
7631 fFsw |= X86_FSW_ES | X86_FSW_B;
7632 }
7633 else
7634 {
7635 fFsw |= X86_FSW_IE;
7636
7637 if (!(fFcw & X86_FCW_IM))
7638 {
7639 pFpuRes->r80Result = *pr80Val2;
7640 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7641 }
7642 else
7643 {
7644 pFpuRes->r80Result = g_r80Indefinite;
7645 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7646 }
7647 }
7648
7649 pFpuRes->FSW = fFsw;
7650}
7651#endif /* IEM_WITHOUT_ASSEMBLY */
7652
/**
 * Intel flavoured fyl2x entry point; simply forwards to
 * iemAImpl_fyl2x_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                         PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7658
/**
 * AMD flavoured fyl2x entry point; simply forwards to
 * iemAImpl_fyl2x_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                       PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7664
7665#if defined(IEM_WITHOUT_ASSEMBLY)
7666
7667static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7668{
7669 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7670 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7671 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7672 extFloat80_t v;
7673 (void)fFcw;
7674
7675 v = extF80_ylog2xp1(y, x, &SoftState);
7676 iemFpuSoftF80ToIprt(pr80Result, v);
7677
7678 return fFsw;
7679}
7680
7681IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7682 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7683{
7684 uint16_t const fFcw = pFpuState->FCW;
7685 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7686
7687 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7688 {
7689 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7690
7691 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7692 if (!(fFcw & X86_FCW_PM))
7693 fFsw |= X86_FSW_ES | X86_FSW_B;
7694 }
7695 else
7696 {
7697 fFsw |= X86_FSW_IE;
7698
7699 if (!(fFcw & X86_FCW_IM))
7700 {
7701 pFpuRes->r80Result = *pr80Val2;
7702 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7703 }
7704 else
7705 {
7706 pFpuRes->r80Result = g_r80Indefinite;
7707 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7708 }
7709 }
7710
7711 pFpuRes->FSW = fFsw;
7712}
7713
7714#endif /* IEM_WITHOUT_ASSEMBLY */
7715
/**
 * Intel flavoured fyl2xp1 entry point; simply forwards to
 * iemAImpl_fyl2xp1_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                           PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7721
/**
 * AMD flavoured fyl2xp1 entry point; simply forwards to
 * iemAImpl_fyl2xp1_r80_by_r80.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                         PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
7727
7728
7729/*********************************************************************************************************************************
7730* MMX, SSE & AVX *
7731*********************************************************************************************************************************/
7732
7733#ifdef IEM_WITH_VEX
7734
7735/*
7736 * VMOVSLDUP
7737 */
7738IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7739{
7740 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7741 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7742 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7743 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7744 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7745 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7746 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7747 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7748}
7749
7750
7751IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7752{
7753 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7754 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7755 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7756 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7757 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7758 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7759 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7760 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7761}
7762
7763#endif /* IEM_WITH_VEX */
7764
7765
7766#ifdef IEM_WITH_VEX
7767
7768/*
7769 * VMOVSHDUP
7770 */
7771IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7772{
7773 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7774 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7775 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7776 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7777 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7778 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7779 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7780 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7781}
7782
7783
7784IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7785{
7786 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7787 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7788 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7789 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7790 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7791 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7792 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7793 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7794}
7795
7796#endif /* IEM_WITH_VEX */
7797
7798
7799#ifdef IEM_WITH_VEX
7800
7801/*
7802 * VMOVDDUP
7803 */
7804IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7805{
7806 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7807 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7808 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7809 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7810}
7811
7812IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7813{
7814 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7815 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7816 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7817 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7818}
7819
7820#endif /* IEM_WITH_VEX */
7821
7822
7823/*
7824 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7825 */
7826#ifdef IEM_WITHOUT_ASSEMBLY
7827
7828IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7829{
7830 RT_NOREF(pFpuState);
7831 *puDst &= *puSrc;
7832}
7833
7834
7835IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7836{
7837 RT_NOREF(pFpuState);
7838 puDst->au64[0] &= puSrc->au64[0];
7839 puDst->au64[1] &= puSrc->au64[1];
7840}
7841
7842#endif
7843
7844IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7845 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7846{
7847 RT_NOREF(pExtState);
7848 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7849 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7850}
7851
7852
7853IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7854 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7855{
7856 RT_NOREF(pExtState);
7857 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7858 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7859 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7860 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7861}
7862
7863
7864/*
7865 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7866 */
7867#ifdef IEM_WITHOUT_ASSEMBLY
7868
7869IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7870{
7871 RT_NOREF(pFpuState);
7872 *puDst = ~*puDst & *puSrc;
7873}
7874
7875
7876IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7877{
7878 RT_NOREF(pFpuState);
7879 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7880 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7881}
7882
7883#endif
7884
7885IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7886 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7887{
7888 RT_NOREF(pExtState);
7889 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7890 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7891}
7892
7893
7894IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7895 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7896{
7897 RT_NOREF(pExtState);
7898 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7899 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7900 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7901 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7902}
7903
7904
7905/*
7906 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7907 */
7908#ifdef IEM_WITHOUT_ASSEMBLY
7909
7910IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7911{
7912 RT_NOREF(pFpuState);
7913 *puDst |= *puSrc;
7914}
7915
7916
7917IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7918{
7919 RT_NOREF(pFpuState);
7920 puDst->au64[0] |= puSrc->au64[0];
7921 puDst->au64[1] |= puSrc->au64[1];
7922}
7923
7924#endif
7925
7926IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7927 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7928{
7929 RT_NOREF(pExtState);
7930 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7931 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7932}
7933
7934
7935IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7936 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7937{
7938 RT_NOREF(pExtState);
7939 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7940 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7941 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7942 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7943}
7944
7945
7946/*
7947 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7948 */
7949#ifdef IEM_WITHOUT_ASSEMBLY
7950
7951IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7952{
7953 RT_NOREF(pFpuState);
7954 *puDst ^= *puSrc;
7955}
7956
7957
7958IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7959{
7960 RT_NOREF(pFpuState);
7961 puDst->au64[0] ^= puSrc->au64[0];
7962 puDst->au64[1] ^= puSrc->au64[1];
7963}
7964
7965#endif
7966
7967IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7968 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7969{
7970 RT_NOREF(pExtState);
7971 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7972 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7973}
7974
7975
7976IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7977 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7978{
7979 RT_NOREF(pExtState);
7980 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7981 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7982 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7983 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7984}
7985
7986
7987/*
7988 * PCMPEQB / VPCMPEQB
7989 */
7990#ifdef IEM_WITHOUT_ASSEMBLY
7991
7992IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7993{
7994 RT_NOREF(pFpuState);
7995 RTUINT64U uSrc1 = { *puDst };
7996 RTUINT64U uSrc2 = { *puSrc };
7997 RTUINT64U uDst;
7998 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7999 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
8000 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
8001 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
8002 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
8003 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
8004 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
8005 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
8006 *puDst = uDst.u;
8007}
8008
8009
8010IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8011{
8012 RT_NOREF(pFpuState);
8013 RTUINT128U uSrc1 = *puDst;
8014 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
8015 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
8016 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
8017 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
8018 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
8019 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
8020 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
8021 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
8022 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
8023 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
8024 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
8025 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
8026 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
8027 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
8028 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
8029 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
8030}
8031
8032#endif
8033
8034IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8035 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8036{
8037 RT_NOREF(pExtState);
8038 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8039 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8040 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8041 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8042 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8043 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8044 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8045 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8046 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8047 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8048 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8049 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8050 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8051 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8052 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8053 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8054}
8055
8056IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8057 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8058{
8059 RT_NOREF(pExtState);
8060 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8061 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8062 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8063 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8064 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8065 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8066 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8067 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8068 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8069 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8070 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8071 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8072 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8073 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8074 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8075 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8076 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
8077 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
8078 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
8079 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
8080 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
8081 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
8082 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
8083 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
8084 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
8085 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
8086 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
8087 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
8088 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
8089 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
8090 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
8091 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
8092}
8093
8094
8095/*
8096 * PCMPEQW / VPCMPEQW
8097 */
8098#ifdef IEM_WITHOUT_ASSEMBLY
8099
8100IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8101{
8102 RT_NOREF(pFpuState);
8103 RTUINT64U uSrc1 = { *puDst };
8104 RTUINT64U uSrc2 = { *puSrc };
8105 RTUINT64U uDst;
8106 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
8107 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
8108 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
8109 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
8110 *puDst = uDst.u;
8111}
8112
8113
8114IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8115{
8116 RT_NOREF(pFpuState);
8117 RTUINT128U uSrc1 = *puDst;
8118 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
8119 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
8120 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
8121 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
8122 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
8123 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
8124 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
8125 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
8126}
8127
8128#endif
8129
8130IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8131 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8132{
8133 RT_NOREF(pExtState);
8134 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8135 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8136 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8137 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8138 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8139 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8140 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8141 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8142}
8143
8144IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8145 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8146{
8147 RT_NOREF(pExtState);
8148 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8149 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8150 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8151 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8152 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8153 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8154 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8155 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8156 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8157 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8158 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8159 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8160 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8161 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8162 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8163 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8164}
8165
8166
8167/*
8168 * PCMPEQD / VPCMPEQD.
8169 */
8170#ifdef IEM_WITHOUT_ASSEMBLY
8171
8172IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8173{
8174 RT_NOREF(pFpuState);
8175 RTUINT64U uSrc1 = { *puDst };
8176 RTUINT64U uSrc2 = { *puSrc };
8177 RTUINT64U uDst;
8178 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8179 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8180 *puDst = uDst.u;
8181}
8182
8183
8184IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8185{
8186 RT_NOREF(pFpuState);
8187 RTUINT128U uSrc1 = *puDst;
8188 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8189 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8190 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8191 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8192}
8193
8194#endif /* IEM_WITHOUT_ASSEMBLY */
8195
8196IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8197 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8198{
8199 RT_NOREF(pExtState);
8200 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8201 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8202 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8203 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8204}
8205
8206IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8207 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8208{
8209 RT_NOREF(pExtState);
8210 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8211 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8212 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8213 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8214 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8215 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8216 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8217 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8218}
8219
8220
8221/*
8222 * PCMPEQQ / VPCMPEQQ.
8223 */
8224IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8225{
8226 RT_NOREF(pFpuState);
8227 RTUINT128U uSrc1 = *puDst;
8228 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8229 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8230}
8231
8232IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8233 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8234{
8235 RT_NOREF(pExtState);
8236 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8237 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8238}
8239
8240IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8241 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8242{
8243 RT_NOREF(pExtState);
8244 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8245 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8246 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8247 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8248}
8249
8250
8251/*
8252 * PCMPGTB / VPCMPGTB
8253 */
8254#ifdef IEM_WITHOUT_ASSEMBLY
8255
8256IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8257{
8258 RT_NOREF(pFpuState);
8259 RTUINT64U uSrc1 = { *puDst };
8260 RTUINT64U uSrc2 = { *puSrc };
8261 RTUINT64U uDst;
8262 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8263 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8264 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8265 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8266 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8267 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8268 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8269 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8270 *puDst = uDst.u;
8271}
8272
8273
8274IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8275{
8276 RT_NOREF(pFpuState);
8277 RTUINT128U uSrc1 = *puDst;
8278 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8279 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8280 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8281 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8282 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8283 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8284 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8285 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8286 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8287 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8288 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8289 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8290 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8291 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8292 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8293 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8294}
8295
8296#endif
8297
8298IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8299 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8300{
8301 RT_NOREF(pExtState);
8302 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8303 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8304 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8305 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8306 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8307 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8308 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8309 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8310 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8311 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8312 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8313 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8314 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8315 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8316 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8317 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8318}
8319
8320IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8321 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8322{
8323 RT_NOREF(pExtState);
8324 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8325 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8326 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8327 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8328 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8329 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8330 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8331 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8332 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8333 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8334 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8335 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8336 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8337 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8338 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8339 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8340 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8341 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8342 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8343 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8344 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8345 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8346 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8347 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8348 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8349 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8350 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8351 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8352 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8353 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8354 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8355 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8356}
8357
8358
8359/*
8360 * PCMPGTW / VPCMPGTW
8361 */
8362#ifdef IEM_WITHOUT_ASSEMBLY
8363
8364IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8365{
8366 RT_NOREF(pFpuState);
8367 RTUINT64U uSrc1 = { *puDst };
8368 RTUINT64U uSrc2 = { *puSrc };
8369 RTUINT64U uDst;
8370 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8371 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8372 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8373 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8374 *puDst = uDst.u;
8375}
8376
8377
8378IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8379{
8380 RT_NOREF(pFpuState);
8381 RTUINT128U uSrc1 = *puDst;
8382 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8383 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8384 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8385 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8386 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8387 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8388 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8389 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8390}
8391
8392#endif
8393
8394IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8395 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8396{
8397 RT_NOREF(pExtState);
8398 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8399 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8400 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8401 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8402 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8403 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8404 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8405 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8406}
8407
8408IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8409 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8410{
8411 RT_NOREF(pExtState);
8412 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8413 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8414 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8415 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8416 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8417 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8418 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8419 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8420 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8421 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8422 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8423 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8424 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8425 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8426 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8427 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8428}
8429
8430
8431/*
8432 * PCMPGTD / VPCMPGTD.
8433 */
8434#ifdef IEM_WITHOUT_ASSEMBLY
8435
8436IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8437{
8438 RT_NOREF(pFpuState);
8439 RTUINT64U uSrc1 = { *puDst };
8440 RTUINT64U uSrc2 = { *puSrc };
8441 RTUINT64U uDst;
8442 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8443 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8444 *puDst = uDst.u;
8445}
8446
8447
8448IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8449{
8450 RT_NOREF(pFpuState);
8451 RTUINT128U uSrc1 = *puDst;
8452 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8453 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8454 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8455 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8456}
8457
8458#endif /* IEM_WITHOUT_ASSEMBLY */
8459
8460IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8461 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8462{
8463 RT_NOREF(pExtState);
8464 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8465 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8466 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8467 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8468}
8469
8470IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8471 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8472{
8473 RT_NOREF(pExtState);
8474 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8475 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8476 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8477 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8478 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8479 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8480 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8481 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8482}
8483
8484
8485/*
8486 * PCMPGTQ / VPCMPGTQ.
8487 */
8488IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8489{
8490 RT_NOREF(pFpuState);
8491 RTUINT128U uSrc1 = *puDst;
8492 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8493 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8494}
8495
8496IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8497 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8498{
8499 RT_NOREF(pExtState);
8500 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8501 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8502}
8503
8504IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8505 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8506{
8507 RT_NOREF(pExtState);
8508 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8509 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8510 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8511 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8512}
8513
8514
8515/*
8516 * PADDB / VPADDB
8517 */
8518#ifdef IEM_WITHOUT_ASSEMBLY
8519
8520IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8521{
8522 RT_NOREF(pFpuState);
8523 RTUINT64U uSrc1 = { *puDst };
8524 RTUINT64U uSrc2 = { *puSrc };
8525 RTUINT64U uDst;
8526 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8527 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8528 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8529 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8530 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8531 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8532 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8533 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8534 *puDst = uDst.u;
8535}
8536
8537
8538IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8539{
8540 RT_NOREF(pFpuState);
8541 RTUINT128U uSrc1 = *puDst;
8542 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8543 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8544 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8545 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8546 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8547 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8548 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8549 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8550 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8551 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8552 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8553 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8554 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8555 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8556 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8557 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8558}
8559
8560#endif
8561
8562
8563IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8564 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8565{
8566 RT_NOREF(pExtState);
8567 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8568 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8569 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8570 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8571 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8572 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8573 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8574 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8575 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8576 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8577 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8578 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8579 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8580 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8581 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8582 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8583}
8584
8585IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8586 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8587{
8588 RT_NOREF(pExtState);
8589 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8590 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8591 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8592 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8593 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8594 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8595 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8596 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8597 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8598 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8599 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8600 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8601 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8602 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8603 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8604 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8605 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8606 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8607 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8608 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8609 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8610 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8611 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8612 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8613 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8614 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8615 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8616 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8617 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8618 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8619 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8620 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8621}
8622
8623
8624/*
8625 * PADDSB / VPADDSB
8626 */
/**
 * Narrows a signed value to a signed byte with saturation, yielding the bit
 * pattern of the result as an unsigned quantity.
 *
 * Values in the range -0x80..0x7f pass through unchanged (as their 8-bit
 * pattern).  Anything outside that range becomes 0x7f (INT8_MAX) when bit 15
 * of the input, i.e. the sign of a 16-bit value, is clear, and 0x80 (INT8_MIN)
 * when it is set.
 *
 * @note    The argument is evaluated more than once, so it must not have side
 *          effects.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) \
      : (uint8_t)(a_iWord) )
8631
8632#ifdef IEM_WITHOUT_ASSEMBLY
8633
8634IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8635{
8636 RT_NOREF(pFpuState);
8637 RTUINT64U uSrc1 = { *puDst };
8638 RTUINT64U uSrc2 = { *puSrc };
8639 RTUINT64U uDst;
8640 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8641 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8642 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8643 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8644 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8645 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8646 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8647 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8648 *puDst = uDst.u;
8649}
8650
8651
8652IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8653{
8654 RT_NOREF(pFpuState);
8655 RTUINT128U uSrc1 = *puDst;
8656 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8657 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8658 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8659 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8660 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8661 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8662 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8663 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8664 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8665 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8666 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8667 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8668 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8669 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8670 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8671 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8672}
8673
8674#endif
8675
8676IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u128_fallback,(PRTUINT128U puDst,
8677 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8678{
8679 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8680 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8681 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8682 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8683 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8684 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8685 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8686 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8687 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8688 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8689 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8690 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8691 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8692 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8693 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8694 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8695}
8696
8697IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u256_fallback,(PRTUINT256U puDst,
8698 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8699{
8700 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8701 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8702 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8703 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8704 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8705 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8706 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8707 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8708 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8709 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8710 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8711 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8712 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8713 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8714 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8715 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8716 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] + puSrc2->ai8[16]);
8717 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] + puSrc2->ai8[17]);
8718 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] + puSrc2->ai8[18]);
8719 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] + puSrc2->ai8[19]);
8720 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] + puSrc2->ai8[20]);
8721 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] + puSrc2->ai8[21]);
8722 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] + puSrc2->ai8[22]);
8723 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] + puSrc2->ai8[23]);
8724 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] + puSrc2->ai8[24]);
8725 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] + puSrc2->ai8[25]);
8726 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] + puSrc2->ai8[26]);
8727 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] + puSrc2->ai8[27]);
8728 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] + puSrc2->ai8[28]);
8729 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] + puSrc2->ai8[29]);
8730 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] + puSrc2->ai8[30]);
8731 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] + puSrc2->ai8[31]);
8732}
8733
8734
8735/*
8736 * PADDUSB / VPADDUSB
8737 */
/** Converts @a a_uWord (taken as uint16_t) to uint8_t, yielding 0xff (UINT8_MAX)
 * for anything above 0xff.  The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
8742
8743#ifdef IEM_WITHOUT_ASSEMBLY
8744
8745IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8746{
8747 RT_NOREF(pFpuState);
8748 RTUINT64U uSrc1 = { *puDst };
8749 RTUINT64U uSrc2 = { *puSrc };
8750 RTUINT64U uDst;
8751 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8752 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8753 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8754 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8755 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8756 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8757 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8758 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8759 *puDst = uDst.u;
8760}
8761
8762
8763IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8764{
8765 RT_NOREF(pFpuState);
8766 RTUINT128U uSrc1 = *puDst;
8767 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8768 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8769 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8770 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8771 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8772 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8773 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8774 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8775 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8776 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8777 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8778 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8779 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8780 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8781 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8782 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8783}
8784
8785#endif
8786
8787IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u128_fallback,(PRTUINT128U puDst,
8788 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8789{
8790 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8791 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8792 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8793 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8794 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8795 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8796 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8797 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8798 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8799 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8800 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8801 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8802 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8803 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8804 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8805 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8806}
8807
8808IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u256_fallback,(PRTUINT256U puDst,
8809 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8810{
8811 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8812 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8813 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8814 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8815 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8816 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8817 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8818 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8819 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8820 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8821 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8822 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8823 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8824 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8825 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8826 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8827 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[16] + puSrc2->au8[16]);
8828 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[17] + puSrc2->au8[17]);
8829 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[18] + puSrc2->au8[18]);
8830 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[19] + puSrc2->au8[19]);
8831 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[20] + puSrc2->au8[20]);
8832 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[21] + puSrc2->au8[21]);
8833 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[22] + puSrc2->au8[22]);
8834 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[23] + puSrc2->au8[23]);
8835 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[24] + puSrc2->au8[24]);
8836 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[25] + puSrc2->au8[25]);
8837 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[26] + puSrc2->au8[26]);
8838 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[27] + puSrc2->au8[27]);
8839 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[28] + puSrc2->au8[28]);
8840 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[29] + puSrc2->au8[29]);
8841 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[30] + puSrc2->au8[30]);
8842 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[31] + puSrc2->au8[31]);
8843}
8844
8845
8846/*
8847 * PADDW / VPADDW
8848 */
8849#ifdef IEM_WITHOUT_ASSEMBLY
8850
8851IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8852{
8853 RT_NOREF(pFpuState);
8854 RTUINT64U uSrc1 = { *puDst };
8855 RTUINT64U uSrc2 = { *puSrc };
8856 RTUINT64U uDst;
8857 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8858 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8859 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8860 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8861 *puDst = uDst.u;
8862}
8863
8864
8865IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8866{
8867 RT_NOREF(pFpuState);
8868 RTUINT128U uSrc1 = *puDst;
8869 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8870 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8871 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8872 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8873 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8874 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8875 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8876 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8877}
8878
8879#endif
8880
8881
8882IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8883 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8884{
8885 RT_NOREF(pExtState);
8886 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8887 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8888 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8889 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8890 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8891 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8892 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8893 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8894}
8895
8896IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8897 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8898{
8899 RT_NOREF(pExtState);
8900 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8901 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8902 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8903 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8904 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8905 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8906 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8907 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8908 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8909 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8910 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8911 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8912 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8913 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8914 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8915 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8916}
8917
8918
8919/*
8920 * PADDSW / VPADDSW
8921 */
/** Converts the signed 32-bit value @a a_iDword to a 16-bit pattern.
 * Values outside INT16_MIN..INT16_MAX yield 0x7fff (INT16_MAX) plus the source
 * sign bit (bit 31), i.e. 0x8000 (INT16_MIN) for negative input.  The argument
 * is evaluated more than once. */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) )
8926
8927#ifdef IEM_WITHOUT_ASSEMBLY
8928
8929IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8930{
8931 RT_NOREF(pFpuState);
8932 RTUINT64U uSrc1 = { *puDst };
8933 RTUINT64U uSrc2 = { *puSrc };
8934 RTUINT64U uDst;
8935 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8936 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8937 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8938 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8939 *puDst = uDst.u;
8940}
8941
8942
8943IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8944{
8945 RT_NOREF(pFpuState);
8946 RTUINT128U uSrc1 = *puDst;
8947 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8948 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8949 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8950 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8951 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8952 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8953 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8954 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8955}
8956
8957#endif
8958
8959IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u128_fallback,(PRTUINT128U puDst,
8960 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8961{
8962 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8963 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8964 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8965 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8966 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8967 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8968 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8969 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8970}
8971
8972IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u256_fallback,(PRTUINT256U puDst,
8973 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8974{
8975 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8976 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8977 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8978 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8979 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8980 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8981 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8982 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8983 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] + puSrc2->ai16[8]);
8984 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] + puSrc2->ai16[9]);
8985 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc2->ai16[10]);
8986 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] + puSrc2->ai16[11]);
8987 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc2->ai16[12]);
8988 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] + puSrc2->ai16[13]);
8989 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc2->ai16[14]);
8990 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] + puSrc2->ai16[15]);
8991}
8992
8993
8994/*
8995 * PADDUSW / VPADDUSW
8996 */
/** Converts @a a_uDword (taken as uint32_t) to uint16_t, yielding 0xffff
 * (UINT16_MAX) for anything above 0xffff.  The argument is evaluated more
 * than once. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
9001
9002#ifdef IEM_WITHOUT_ASSEMBLY
9003
9004IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9005{
9006 RT_NOREF(pFpuState);
9007 RTUINT64U uSrc1 = { *puDst };
9008 RTUINT64U uSrc2 = { *puSrc };
9009 RTUINT64U uDst;
9010 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
9011 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
9012 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
9013 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
9014 *puDst = uDst.u;
9015}
9016
9017
9018IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9019{
9020 RT_NOREF(pFpuState);
9021 RTUINT128U uSrc1 = *puDst;
9022 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
9023 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
9024 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
9025 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
9026 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
9027 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
9028 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
9029 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
9030}
9031
9032#endif
9033
9034IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u128_fallback,(PRTUINT128U puDst,
9035 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9036{
9037 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
9038 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
9039 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
9040 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
9041 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
9042 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
9043 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
9044 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
9045}
9046
9047IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u256_fallback,(PRTUINT256U puDst,
9048 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9049{
9050 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
9051 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
9052 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
9053 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
9054 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
9055 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
9056 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
9057 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
9058 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[8] + puSrc2->au16[8]);
9059 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[9] + puSrc2->au16[9]);
9060 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[10] + puSrc2->au16[10]);
9061 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[11] + puSrc2->au16[11]);
9062 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[12] + puSrc2->au16[12]);
9063 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[13] + puSrc2->au16[13]);
9064 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[14] + puSrc2->au16[14]);
9065 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[15] + puSrc2->au16[15]);
9066}
9067
9068
9069/*
9070 * PADDD / VPADDD.
9071 */
9072#ifdef IEM_WITHOUT_ASSEMBLY
9073
9074IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9075{
9076 RT_NOREF(pFpuState);
9077 RTUINT64U uSrc1 = { *puDst };
9078 RTUINT64U uSrc2 = { *puSrc };
9079 RTUINT64U uDst;
9080 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
9081 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
9082 *puDst = uDst.u;
9083}
9084
9085
9086IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9087{
9088 RT_NOREF(pFpuState);
9089 RTUINT128U uSrc1 = *puDst;
9090 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
9091 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
9092 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
9093 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
9094}
9095
9096#endif /* IEM_WITHOUT_ASSEMBLY */
9097
9098IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9099 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9100{
9101 RT_NOREF(pExtState);
9102 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9103 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9104 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9105 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9106}
9107
9108IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9109 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9110{
9111 RT_NOREF(pExtState);
9112 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9113 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9114 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9115 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9116 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
9117 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
9118 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
9119 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
9120}
9121
9122
9123/*
9124 * PADDQ / VPADDQ.
9125 */
9126#ifdef IEM_WITHOUT_ASSEMBLY
9127
9128IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9129{
9130 RT_NOREF(pFpuState);
9131 *puDst = *puDst + *puSrc;
9132}
9133
9134IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9135{
9136 RT_NOREF(pFpuState);
9137 RTUINT128U uSrc1 = *puDst;
9138 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
9139 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
9140}
9141
9142#endif
9143
9144IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9145 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9146{
9147 RT_NOREF(pExtState);
9148 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9149 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9150}
9151
9152IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9153 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9154{
9155 RT_NOREF(pExtState);
9156 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9157 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9158 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
9159 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
9160}
9161
9162
9163/*
9164 * PSUBB / VPSUBB
9165 */
9166#ifdef IEM_WITHOUT_ASSEMBLY
9167
9168IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9169{
9170 RT_NOREF(pFpuState);
9171 RTUINT64U uSrc1 = { *puDst };
9172 RTUINT64U uSrc2 = { *puSrc };
9173 RTUINT64U uDst;
9174 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
9175 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
9176 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
9177 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
9178 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
9179 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
9180 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
9181 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
9182 *puDst = uDst.u;
9183}
9184
9185
9186IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9187{
9188 RT_NOREF(pFpuState);
9189 RTUINT128U uSrc1 = *puDst;
9190 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
9191 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
9192 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
9193 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
9194 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
9195 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
9196 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
9197 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
9198 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
9199 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
9200 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
9201 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
9202 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
9203 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
9204 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
9205 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
9206}
9207
9208#endif
9209
9210IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9211 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9212{
9213 RT_NOREF(pExtState);
9214 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9215 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9216 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9217 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9218 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9219 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9220 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9221 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9222 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9223 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9224 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9225 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9226 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9227 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9228 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9229 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9230}
9231
9232IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9233 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9234{
9235 RT_NOREF(pExtState);
9236 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9237 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9238 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9239 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9240 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9241 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9242 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9243 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9244 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9245 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9246 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9247 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9248 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9249 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9250 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9251 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9252 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
9253 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
9254 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
9255 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
9256 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
9257 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
9258 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
9259 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
9260 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
9261 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
9262 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
9263 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
9264 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
9265 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
9266 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
9267 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
9268}
9269
9270
9271/*
 * PSUBSB / VPSUBSB
9273 */
9274#ifdef IEM_WITHOUT_ASSEMBLY
9275
9276IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9277{
9278 RT_NOREF(pFpuState);
9279 RTUINT64U uSrc1 = { *puDst };
9280 RTUINT64U uSrc2 = { *puSrc };
9281 RTUINT64U uDst;
9282 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
9283 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
9284 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
9285 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
9286 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
9287 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
9288 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
9289 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
9290 *puDst = uDst.u;
9291}
9292
9293
9294IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9295{
9296 RT_NOREF(pFpuState);
9297 RTUINT128U uSrc1 = *puDst;
9298 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
9299 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
9300 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
9301 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
9302 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
9303 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
9304 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
9305 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
9306 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
9307 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
9308 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
9309 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
9310 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
9311 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
9312 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
9313 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
9314}
9315
9316#endif
9317
9318IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u128_fallback,(PRTUINT128U puDst,
9319 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9320{
9321 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9322 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9323 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9324 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9325 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9326 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9327 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9328 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9329 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9330 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9331 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9332 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9333 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9334 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9335 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9336 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9337}
9338
9339IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u256_fallback,(PRTUINT256U puDst,
9340 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9341{
9342 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9343 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9344 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9345 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9346 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9347 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9348 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9349 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9350 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9351 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9352 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9353 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9354 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9355 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9356 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9357 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9358 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] - puSrc2->ai8[16]);
9359 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] - puSrc2->ai8[17]);
9360 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] - puSrc2->ai8[18]);
9361 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] - puSrc2->ai8[19]);
9362 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] - puSrc2->ai8[20]);
9363 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] - puSrc2->ai8[21]);
9364 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] - puSrc2->ai8[22]);
9365 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] - puSrc2->ai8[23]);
9366 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] - puSrc2->ai8[24]);
9367 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] - puSrc2->ai8[25]);
9368 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] - puSrc2->ai8[26]);
9369 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] - puSrc2->ai8[27]);
9370 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] - puSrc2->ai8[28]);
9371 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] - puSrc2->ai8[29]);
9372 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] - puSrc2->ai8[30]);
9373 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] - puSrc2->ai8[31]);
9374}
9375
9376
9377/*
 * PSUBUSB / VPSUBUSB
9379 */
/** Narrows the result of an unsigned byte subtraction to 8 bits: when the
 *  value truncated to 16 bits exceeds 0xff (as a negative difference does),
 *  the result is 0; otherwise it is the value itself. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
     ? (uint8_t)0 \
     : (uint8_t)(a_uWord) )
9384
9385#ifdef IEM_WITHOUT_ASSEMBLY
9386
9387IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9388{
9389 RT_NOREF(pFpuState);
9390 RTUINT64U uSrc1 = { *puDst };
9391 RTUINT64U uSrc2 = { *puSrc };
9392 RTUINT64U uDst;
9393 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9394 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9395 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9396 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9397 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9398 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9399 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9400 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9401 *puDst = uDst.u;
9402}
9403
9404
9405IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9406{
9407 RT_NOREF(pFpuState);
9408 RTUINT128U uSrc1 = *puDst;
9409 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9410 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9411 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9412 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9413 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9414 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9415 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9416 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9417 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9418 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9419 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9420 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9421 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9422 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9423 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9424 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9425}
9426
9427#endif
9428
9429IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u128_fallback,(PRTUINT128U puDst,
9430 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9431{
9432 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9433 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9434 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9435 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9436 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9437 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9438 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9439 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9440 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9441 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9442 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9443 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9444 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9445 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9446 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9447 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9448}
9449
9450IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u256_fallback,(PRTUINT256U puDst,
9451 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9452{
9453 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9454 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9455 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9456 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9457 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9458 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9459 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9460 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9461 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9462 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9463 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9464 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9465 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9466 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9467 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9468 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9469 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[16] - puSrc2->au8[16]);
9470 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[17] - puSrc2->au8[17]);
9471 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[18] - puSrc2->au8[18]);
9472 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[19] - puSrc2->au8[19]);
9473 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[20] - puSrc2->au8[20]);
9474 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[21] - puSrc2->au8[21]);
9475 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[22] - puSrc2->au8[22]);
9476 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[23] - puSrc2->au8[23]);
9477 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[24] - puSrc2->au8[24]);
9478 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[25] - puSrc2->au8[25]);
9479 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[26] - puSrc2->au8[26]);
9480 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[27] - puSrc2->au8[27]);
9481 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[28] - puSrc2->au8[28]);
9482 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[29] - puSrc2->au8[29]);
9483 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[30] - puSrc2->au8[30]);
9484 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[31] - puSrc2->au8[31]);
9485}
9486
9487
9488/*
9489 * PSUBW / VPSUBW
9490 */
9491#ifdef IEM_WITHOUT_ASSEMBLY
9492
9493IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9494{
9495 RT_NOREF(pFpuState);
9496 RTUINT64U uSrc1 = { *puDst };
9497 RTUINT64U uSrc2 = { *puSrc };
9498 RTUINT64U uDst;
9499 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9500 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9501 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9502 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9503 *puDst = uDst.u;
9504}
9505
9506
9507IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9508{
9509 RT_NOREF(pFpuState);
9510 RTUINT128U uSrc1 = *puDst;
9511 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9512 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9513 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9514 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9515 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9516 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9517 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9518 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9519}
9520
9521#endif
9522
9523IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9524 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9525{
9526 RT_NOREF(pExtState);
9527 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9528 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9529 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9530 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9531 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9532 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9533 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9534 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9535}
9536
9537IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9538 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9539{
9540 RT_NOREF(pExtState);
9541 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9542 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9543 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9544 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9545 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9546 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9547 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9548 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9549 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9550 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9551 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9552 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9553 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9554 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9555 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9556 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9557}
9558
9559
9560/*
9561 * PSUBSW / VPSUBSW
9562 */
9563#ifdef IEM_WITHOUT_ASSEMBLY
9564
9565IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9566{
9567 RT_NOREF(pFpuState);
9568 RTUINT64U uSrc1 = { *puDst };
9569 RTUINT64U uSrc2 = { *puSrc };
9570 RTUINT64U uDst;
9571 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9572 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9573 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9574 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9575 *puDst = uDst.u;
9576}
9577
9578
9579IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9580{
9581 RT_NOREF(pFpuState);
9582 RTUINT128U uSrc1 = *puDst;
9583 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9584 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9585 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9586 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9587 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9588 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9589 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9590 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9591}
9592
9593#endif
9594
9595IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u128_fallback,(PRTUINT128U puDst,
9596 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9597{
9598 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9599 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9600 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9601 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9602 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9603 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9604 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9605 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9606}
9607
9608IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u256_fallback,(PRTUINT256U puDst,
9609 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9610{
9611 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9612 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9613 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9614 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9615 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9616 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9617 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9618 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9619 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] - puSrc2->ai16[8]);
9620 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] - puSrc2->ai16[9]);
9621 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc2->ai16[10]);
9622 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] - puSrc2->ai16[11]);
9623 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc2->ai16[12]);
9624 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] - puSrc2->ai16[13]);
9625 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc2->ai16[14]);
9626 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] - puSrc2->ai16[15]);
9627}
9628
9629
9630/*
9631 * PSUBUSW / VPSUBUSW
9632 */
/** Narrows the result of an unsigned word subtraction to 16 bits: when the
 *  value converted to 32 bits unsigned exceeds 0xffff (as a negative
 *  difference does), the result is 0; otherwise it is the value itself. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
     ? (uint16_t)0 \
     : (uint16_t)(a_uDword) )
9637
9638#ifdef IEM_WITHOUT_ASSEMBLY
9639
9640IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9641{
9642 RT_NOREF(pFpuState);
9643 RTUINT64U uSrc1 = { *puDst };
9644 RTUINT64U uSrc2 = { *puSrc };
9645 RTUINT64U uDst;
9646 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9647 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9648 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9649 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9650 *puDst = uDst.u;
9651}
9652
9653
9654IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9655{
9656 RT_NOREF(pFpuState);
9657 RTUINT128U uSrc1 = *puDst;
9658 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9659 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9660 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9661 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9662 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9663 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9664 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9665 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9666}
9667
9668#endif
9669
9670IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u128_fallback,(PRTUINT128U puDst,
9671 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9672{
9673 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9674 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9675 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9676 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9677 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9678 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9679 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9680 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9681}
9682
9683IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u256_fallback,(PRTUINT256U puDst,
9684 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9685{
9686 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9687 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9688 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9689 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9690 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9691 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9692 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9693 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9694 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[8] - puSrc2->au16[8]);
9695 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[9] - puSrc2->au16[9]);
9696 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[10] - puSrc2->au16[10]);
9697 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[11] - puSrc2->au16[11]);
9698 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[12] - puSrc2->au16[12]);
9699 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[13] - puSrc2->au16[13]);
9700 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[14] - puSrc2->au16[14]);
9701 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[15] - puSrc2->au16[15]);
9702}
9703
9704
9705
9706/*
9707 * PSUBD / VPSUBD.
9708 */
9709#ifdef IEM_WITHOUT_ASSEMBLY
9710
9711IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9712{
9713 RT_NOREF(pFpuState);
9714 RTUINT64U uSrc1 = { *puDst };
9715 RTUINT64U uSrc2 = { *puSrc };
9716 RTUINT64U uDst;
9717 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9718 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9719 *puDst = uDst.u;
9720}
9721
9722
9723IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9724{
9725 RT_NOREF(pFpuState);
9726 RTUINT128U uSrc1 = *puDst;
9727 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9728 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9729 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9730 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9731}
9732
9733#endif /* IEM_WITHOUT_ASSEMBLY */
9734
9735IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9736 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9737{
9738 RT_NOREF(pExtState);
9739 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9740 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9741 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9742 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9743}
9744
9745IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9746 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9747{
9748 RT_NOREF(pExtState);
9749 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9750 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9751 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9752 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9753 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9754 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9755 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9756 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9757}
9758
9759
9760/*
9761 * PSUBQ / VPSUBQ.
9762 */
9763#ifdef IEM_WITHOUT_ASSEMBLY
9764
9765IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9766{
9767 RT_NOREF(pFpuState);
9768 *puDst = *puDst - *puSrc;
9769}
9770
9771IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9772{
9773 RT_NOREF(pFpuState);
9774 RTUINT128U uSrc1 = *puDst;
9775 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9776 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9777}
9778
9779#endif
9780
9781IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9782 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9783{
9784 RT_NOREF(pExtState);
9785 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9786 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9787}
9788
9789IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9790 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9791{
9792 RT_NOREF(pExtState);
9793 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9794 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9795 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9796 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9797}
9798
9799
9800
9801/*
9802 * PMULLW / VPMULLW / PMULLD / VPMULLD
9803 */
9804#ifdef IEM_WITHOUT_ASSEMBLY
9805
9806IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9807{
9808 RT_NOREF(pFpuState);
9809 RTUINT64U uSrc1 = { *puDst };
9810 RTUINT64U uSrc2 = { *puSrc };
9811 RTUINT64U uDst;
9812 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9813 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9814 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9815 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9816 *puDst = uDst.u;
9817}
9818
9819
9820IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9821{
9822 RT_NOREF(pFpuState);
9823 RTUINT128U uSrc1 = *puDst;
9824 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9825 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9826 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9827 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9828 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9829 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9830 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9831 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9832}
9833
9834#endif
9835
9836IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9837{
9838 RTUINT128U uSrc1 = *puDst;
9839
9840 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9841 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9842 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9843 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9844 RT_NOREF(pFpuState);
9845}
9846
9847
9848IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9849{
9850 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9851 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9852 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9853 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9854 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9855 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9856 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9857 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9858}
9859
9860
9861IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9862{
9863 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9864 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9865 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9866 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9867 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9868 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9869 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9870 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9871 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9872 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9873 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9874 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9875 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9876 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9877 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9878 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9879}
9880
9881
9882IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9883{
9884 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9885 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9886 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9887 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9888}
9889
9890
9891IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9892{
9893 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9894 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9895 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9896 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9897 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9898 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9899 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9900 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9901}
9902
9903
9904/*
9905 * PMULHW / VPMULHW
9906 */
9907#ifdef IEM_WITHOUT_ASSEMBLY
9908
9909IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9910{
9911 RT_NOREF(pFpuState);
9912 RTUINT64U uSrc1 = { *puDst };
9913 RTUINT64U uSrc2 = { *puSrc };
9914 RTUINT64U uDst;
9915 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9916 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9917 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9918 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9919 *puDst = uDst.u;
9920}
9921
9922
9923IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9924{
9925 RT_NOREF(pFpuState);
9926 RTUINT128U uSrc1 = *puDst;
9927 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9928 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9929 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9930 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9931 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9932 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9933 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9934 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9935}
9936
9937#endif
9938
9939IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9940{
9941 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9942 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9943 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9944 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9945 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9946 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9947 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9948 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9949}
9950
9951
9952IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9953{
9954 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9955 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9956 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9957 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9958 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9959 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9960 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9961 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9962 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9963 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9964 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9965 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9966 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9967 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9968 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9969 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9970}
9971
9972
9973/*
9974 * PMULHUW / VPMULHUW
9975 */
9976#ifdef IEM_WITHOUT_ASSEMBLY
9977
9978IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9979{
9980 RTUINT64U uSrc1 = { *puDst };
9981 RTUINT64U uSrc2 = { *puSrc };
9982 RTUINT64U uDst;
9983 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9984 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9985 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9986 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9987 *puDst = uDst.u;
9988}
9989
9990
9991IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9992{
9993 RTUINT128U uSrc1 = *puDst;
9994 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9995 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9996 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9997 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9998 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9999 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
10000 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
10001 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
10002}
10003
10004#endif
10005
10006IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10007{
10008 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
10009 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
10010 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
10011 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
10012 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
10013 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
10014 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
10015 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
10016}
10017
10018
10019IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10020{
10021 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
10022 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
10023 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
10024 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
10025 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
10026 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
10027 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
10028 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
10029 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
10030 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
10031 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
10032 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
10033 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
10034 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
10035 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
10036 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
10037}
10038
10039
10040/*
10041 * PSRLW / VPSRLW
10042 */
10043#ifdef IEM_WITHOUT_ASSEMBLY
10044
10045IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10046{
10047 RTUINT64U uSrc1 = { *puDst };
10048 RTUINT64U uSrc2 = { *puSrc };
10049 RTUINT64U uDst;
10050
10051 if (uSrc2.au64[0] <= 15)
10052 {
10053 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
10054 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
10055 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
10056 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
10057 }
10058 else
10059 {
10060 uDst.au64[0] = 0;
10061 }
10062 *puDst = uDst.u;
10063}
10064
10065
10066IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10067{
10068 RTUINT64U uSrc1 = { *puDst };
10069 RTUINT64U uDst;
10070
10071 if (uShift <= 15)
10072 {
10073 uDst.au16[0] = uSrc1.au16[0] >> uShift;
10074 uDst.au16[1] = uSrc1.au16[1] >> uShift;
10075 uDst.au16[2] = uSrc1.au16[2] >> uShift;
10076 uDst.au16[3] = uSrc1.au16[3] >> uShift;
10077 }
10078 else
10079 {
10080 uDst.au64[0] = 0;
10081 }
10082 *puDst = uDst.u;
10083}
10084
10085
10086IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10087{
10088 RTUINT128U uSrc1 = *puDst;
10089
10090 if (puSrc->au64[0] <= 15)
10091 {
10092 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
10093 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
10094 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
10095 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
10096 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
10097 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
10098 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
10099 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
10100 }
10101 else
10102 {
10103 puDst->au64[0] = 0;
10104 puDst->au64[1] = 0;
10105 }
10106}
10107
10108IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10109{
10110 RTUINT128U uSrc1 = *puDst;
10111
10112 if (uShift <= 15)
10113 {
10114 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10115 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10116 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10117 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10118 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10119 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10120 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10121 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10122 }
10123 else
10124 {
10125 puDst->au64[0] = 0;
10126 puDst->au64[1] = 0;
10127 }
10128}
10129
10130#endif
10131
10132IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10133{
10134 RTUINT128U uSrc1 = *puSrc1;
10135
10136 if (uShift <= 15)
10137 {
10138 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10139 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10140 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10141 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10142 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10143 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10144 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10145 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10146 }
10147 else
10148 {
10149 puDst->au64[0] = 0;
10150 puDst->au64[1] = 0;
10151 }
10152}
10153
10154IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10155{
10156 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10157}
10158
10159IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10160{
10161 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, uShift);
10162}
10163
10164IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10165{
10166 RTUINT256U uSrc1 = *puSrc1;
10167
10168 if (uShift <= 15)
10169 {
10170 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10171 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10172 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10173 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10174 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10175 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10176 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10177 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10178 puDst->au16[8] = uSrc1.au16[8] >> uShift;
10179 puDst->au16[9] = uSrc1.au16[9] >> uShift;
10180 puDst->au16[10] = uSrc1.au16[10] >> uShift;
10181 puDst->au16[11] = uSrc1.au16[11] >> uShift;
10182 puDst->au16[12] = uSrc1.au16[12] >> uShift;
10183 puDst->au16[13] = uSrc1.au16[13] >> uShift;
10184 puDst->au16[14] = uSrc1.au16[14] >> uShift;
10185 puDst->au16[15] = uSrc1.au16[15] >> uShift;
10186 }
10187 else
10188 {
10189 puDst->au64[0] = 0;
10190 puDst->au64[1] = 0;
10191 puDst->au64[2] = 0;
10192 puDst->au64[3] = 0;
10193 }
10194}
10195
10196IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10197{
10198 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, uShift);
10199}
10200
10201IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10202{
10203 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10204}
10205
10206
10207/*
10208 * PSRAW / VPSRAW
10209 */
10210#ifdef IEM_WITHOUT_ASSEMBLY
10211
10212IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10213{
10214 RTUINT64U uSrc1 = { *puDst };
10215 RTUINT64U uSrc2 = { *puSrc };
10216 RTUINT64U uDst;
10217 uint8_t uShift;
10218
10219 uShift = RT_MIN(15, uSrc2.au64[0]);
10220
10221 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10222 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10223 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10224 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10225
10226 *puDst = uDst.u;
10227}
10228
10229
10230IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10231{
10232 RTUINT64U uSrc1 = { *puDst };
10233 RTUINT64U uDst;
10234
10235 uShift = RT_MIN(15, uShift);
10236
10237 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10238 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10239 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10240 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10241
10242 *puDst = uDst.u;
10243}
10244
10245
10246IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10247{
10248 RTUINT128U uSrc1 = *puDst;
10249 uint8_t uShift;
10250
10251 uShift = RT_MIN(15, puSrc->au64[0]);
10252
10253 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10254 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10255 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10256 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10257 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10258 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10259 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10260 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10261}
10262
10263IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10264{
10265 RTUINT128U uSrc1 = *puDst;
10266
10267 uShift = RT_MIN(15, uShift);
10268
10269 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10270 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10271 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10272 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10273 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10274 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10275 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10276 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10277}
10278
10279#endif
10280
10281IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10282{
10283 RTUINT128U uSrc1 = *puSrc1;
10284
10285 uShift = RT_MIN(15, uShift);
10286
10287 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10288 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10289 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10290 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10291 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10292 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10293 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10294 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10295}
10296
10297IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10298{
10299 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10300}
10301
10302IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10303{
10304 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, uShift);
10305}
10306
10307IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10308{
10309 RTUINT256U uSrc1 = *puSrc1;
10310
10311 uShift = RT_MIN(15, uShift);
10312
10313 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10314 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10315 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10316 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10317 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10318 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10319 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10320 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10321 puDst->ai16[8] = uSrc1.ai16[8] >> uShift;
10322 puDst->ai16[9] = uSrc1.ai16[9] >> uShift;
10323 puDst->ai16[10] = uSrc1.ai16[10] >> uShift;
10324 puDst->ai16[11] = uSrc1.ai16[11] >> uShift;
10325 puDst->ai16[12] = uSrc1.ai16[12] >> uShift;
10326 puDst->ai16[13] = uSrc1.ai16[13] >> uShift;
10327 puDst->ai16[14] = uSrc1.ai16[14] >> uShift;
10328 puDst->ai16[15] = uSrc1.ai16[15] >> uShift;
10329}
10330
10331IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10332{
10333 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, uShift);
10334}
10335
10336IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10337{
10338 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10339}
10340
10341
10342/*
10343 * PSLLW / VPSLLW
10344 */
10345#ifdef IEM_WITHOUT_ASSEMBLY
10346
10347IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10348{
10349 RTUINT64U uSrc1 = { *puDst };
10350 RTUINT64U uSrc2 = { *puSrc };
10351 RTUINT64U uDst;
10352
10353 if (uSrc2.au64[0] <= 15)
10354 {
10355 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
10356 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
10357 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
10358 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
10359 }
10360 else
10361 {
10362 uDst.au64[0] = 0;
10363 }
10364 *puDst = uDst.u;
10365}
10366
10367
10368IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10369{
10370 RTUINT64U uSrc1 = { *puDst };
10371 RTUINT64U uDst;
10372
10373 if (uShift <= 15)
10374 {
10375 uDst.au16[0] = uSrc1.au16[0] << uShift;
10376 uDst.au16[1] = uSrc1.au16[1] << uShift;
10377 uDst.au16[2] = uSrc1.au16[2] << uShift;
10378 uDst.au16[3] = uSrc1.au16[3] << uShift;
10379 }
10380 else
10381 {
10382 uDst.au64[0] = 0;
10383 }
10384 *puDst = uDst.u;
10385}
10386
10387
10388IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10389{
10390 RTUINT128U uSrc1 = *puDst;
10391
10392 if (puSrc->au64[0] <= 15)
10393 {
10394 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
10395 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
10396 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
10397 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
10398 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
10399 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
10400 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
10401 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
10402 }
10403 else
10404 {
10405 puDst->au64[0] = 0;
10406 puDst->au64[1] = 0;
10407 }
10408}
10409
10410IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10411{
10412 RTUINT128U uSrc1 = *puDst;
10413
10414 if (uShift <= 15)
10415 {
10416 puDst->au16[0] = uSrc1.au16[0] << uShift;
10417 puDst->au16[1] = uSrc1.au16[1] << uShift;
10418 puDst->au16[2] = uSrc1.au16[2] << uShift;
10419 puDst->au16[3] = uSrc1.au16[3] << uShift;
10420 puDst->au16[4] = uSrc1.au16[4] << uShift;
10421 puDst->au16[5] = uSrc1.au16[5] << uShift;
10422 puDst->au16[6] = uSrc1.au16[6] << uShift;
10423 puDst->au16[7] = uSrc1.au16[7] << uShift;
10424 }
10425 else
10426 {
10427 puDst->au64[0] = 0;
10428 puDst->au64[1] = 0;
10429 }
10430}
10431
10432#endif
10433
10434IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10435{
10436 RTUINT128U uSrc1 = *puSrc1;
10437
10438 if (uShift <= 15)
10439 {
10440 puDst->au16[0] = uSrc1.au16[0] << uShift;
10441 puDst->au16[1] = uSrc1.au16[1] << uShift;
10442 puDst->au16[2] = uSrc1.au16[2] << uShift;
10443 puDst->au16[3] = uSrc1.au16[3] << uShift;
10444 puDst->au16[4] = uSrc1.au16[4] << uShift;
10445 puDst->au16[5] = uSrc1.au16[5] << uShift;
10446 puDst->au16[6] = uSrc1.au16[6] << uShift;
10447 puDst->au16[7] = uSrc1.au16[7] << uShift;
10448 }
10449 else
10450 {
10451 puDst->au64[0] = 0;
10452 puDst->au64[1] = 0;
10453 }
10454}
10455
10456IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10457{
10458 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, puSrc2->au8[0]);
10459}
10460
10461IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10462{
10463 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, uShift);
10464}
10465
10466IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10467{
10468 RTUINT256U uSrc1 = *puSrc1;
10469
10470 if (uShift <= 15)
10471 {
10472 puDst->au16[0] = uSrc1.au16[0] << uShift;
10473 puDst->au16[1] = uSrc1.au16[1] << uShift;
10474 puDst->au16[2] = uSrc1.au16[2] << uShift;
10475 puDst->au16[3] = uSrc1.au16[3] << uShift;
10476 puDst->au16[4] = uSrc1.au16[4] << uShift;
10477 puDst->au16[5] = uSrc1.au16[5] << uShift;
10478 puDst->au16[6] = uSrc1.au16[6] << uShift;
10479 puDst->au16[7] = uSrc1.au16[7] << uShift;
10480 puDst->au16[8] = uSrc1.au16[8] << uShift;
10481 puDst->au16[9] = uSrc1.au16[9] << uShift;
10482 puDst->au16[10] = uSrc1.au16[10] << uShift;
10483 puDst->au16[11] = uSrc1.au16[11] << uShift;
10484 puDst->au16[12] = uSrc1.au16[12] << uShift;
10485 puDst->au16[13] = uSrc1.au16[13] << uShift;
10486 puDst->au16[14] = uSrc1.au16[14] << uShift;
10487 puDst->au16[15] = uSrc1.au16[15] << uShift;
10488 }
10489 else
10490 {
10491 puDst->au64[0] = 0;
10492 puDst->au64[1] = 0;
10493 puDst->au64[2] = 0;
10494 puDst->au64[3] = 0;
10495 }
10496}
10497
10498IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10499{
10500 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, puSrc2->au8[0]);
10501}
10502
10503IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10504{
10505 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, uShift);
10506}
10507
10508/*
10509 * PSRLD / VPSRLD
10510 */
10511#ifdef IEM_WITHOUT_ASSEMBLY
10512
10513IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10514{
10515 RTUINT64U uSrc1 = { *puDst };
10516 RTUINT64U uSrc2 = { *puSrc };
10517 RTUINT64U uDst;
10518
10519 if (uSrc2.au64[0] <= 31)
10520 {
10521 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
10522 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
10523 }
10524 else
10525 {
10526 uDst.au64[0] = 0;
10527 }
10528 *puDst = uDst.u;
10529}
10530
10531
10532IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10533{
10534 RTUINT64U uSrc1 = { *puDst };
10535 RTUINT64U uDst;
10536
10537 if (uShift <= 31)
10538 {
10539 uDst.au32[0] = uSrc1.au32[0] >> uShift;
10540 uDst.au32[1] = uSrc1.au32[1] >> uShift;
10541 }
10542 else
10543 {
10544 uDst.au64[0] = 0;
10545 }
10546 *puDst = uDst.u;
10547}
10548
10549
10550IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10551{
10552 RTUINT128U uSrc1 = *puDst;
10553
10554 if (puSrc->au64[0] <= 31)
10555 {
10556 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
10557 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
10558 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
10559 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
10560 }
10561 else
10562 {
10563 puDst->au64[0] = 0;
10564 puDst->au64[1] = 0;
10565 }
10566}
10567
10568IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10569{
10570 RTUINT128U uSrc1 = *puDst;
10571
10572 if (uShift <= 31)
10573 {
10574 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10575 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10576 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10577 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10578 }
10579 else
10580 {
10581 puDst->au64[0] = 0;
10582 puDst->au64[1] = 0;
10583 }
10584}
10585
10586#endif
10587
10588IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10589{
10590 RTUINT128U uSrc1 = *puSrc1;
10591
10592 if (uShift <= 31)
10593 {
10594 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10595 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10596 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10597 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10598 }
10599 else
10600 {
10601 puDst->au64[0] = 0;
10602 puDst->au64[1] = 0;
10603 }
10604}
10605
10606IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10607{
10608 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, uShift);
10609}
10610
10611IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10612{
10613 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10614}
10615
10616IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10617{
10618 RTUINT256U uSrc1 = *puSrc1;
10619
10620 if (uShift <= 31)
10621 {
10622 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10623 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10624 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10625 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10626 puDst->au32[4] = uSrc1.au32[4] >> uShift;
10627 puDst->au32[5] = uSrc1.au32[5] >> uShift;
10628 puDst->au32[6] = uSrc1.au32[6] >> uShift;
10629 puDst->au32[7] = uSrc1.au32[7] >> uShift;
10630 }
10631 else
10632 {
10633 puDst->au64[0] = 0;
10634 puDst->au64[1] = 0;
10635 puDst->au64[2] = 0;
10636 puDst->au64[3] = 0;
10637 }
10638}
10639
10640IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10641{
10642 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10643}
10644
10645IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10646{
10647 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, uShift);
10648}
10649
10650
10651/*
10652 * PSRAD / VPSRAD
10653 */
10654#ifdef IEM_WITHOUT_ASSEMBLY
10655
10656IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
10657{
10658 RTUINT64U uSrc1 = { *puDst };
10659 RTUINT64U uSrc2 = { *puSrc };
10660 RTUINT64U uDst;
10661 uint8_t uShift;
10662
10663 uShift = RT_MIN(31, uSrc2.au64[0]);
10664
10665 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10666 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10667
10668 *puDst = uDst.u;
10669}
10670
10671
10672IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
10673{
10674 RTUINT64U uSrc1 = { *puDst };
10675 RTUINT64U uDst;
10676
10677 uShift = RT_MIN(31, uShift);
10678
10679 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10680 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10681
10682 *puDst = uDst.u;
10683}
10684
10685
10686IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10687{
10688 RTUINT128U uSrc1 = *puDst;
10689 uint8_t uShift;
10690
10691 uShift = RT_MIN(31, puSrc->au64[0]);
10692
10693 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10694 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10695 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10696 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10697}
10698
10699IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10700{
10701 RTUINT128U uSrc1 = *puDst;
10702
10703 uShift = RT_MIN(31, uShift);
10704
10705 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10706 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10707 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10708 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10709}
10710
10711#endif
10712
10713IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10714{
10715 RTUINT128U uSrc1 = *puSrc1;
10716
10717 uShift = RT_MIN(31, uShift);
10718
10719 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10720 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10721 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10722 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10723}
10724
10725IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10726{
10727 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, uShift);
10728}
10729
10730IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10731{
10732 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10733}
10734
10735IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10736{
10737 RTUINT256U uSrc1 = *puSrc1;
10738
10739 uShift = RT_MIN(31, uShift);
10740
10741 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10742 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10743 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10744 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10745 puDst->ai32[4] = uSrc1.ai32[4] >> uShift;
10746 puDst->ai32[5] = uSrc1.ai32[5] >> uShift;
10747 puDst->ai32[6] = uSrc1.ai32[6] >> uShift;
10748 puDst->ai32[7] = uSrc1.ai32[7] >> uShift;
10749}
10750
10751IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10752{
10753 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10754}
10755
10756IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10757{
10758 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, uShift);
10759}
10760
10761
10762/*
10763 * PSLLD / VPSLLD
10764 */
10765#ifdef IEM_WITHOUT_ASSEMBLY
10766
10767IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10768{
10769 RTUINT64U uSrc1 = { *puDst };
10770 RTUINT64U uSrc2 = { *puSrc };
10771 RTUINT64U uDst;
10772
10773 if (uSrc2.au64[0] <= 31)
10774 {
10775 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
10776 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
10777 }
10778 else
10779 {
10780 uDst.au64[0] = 0;
10781 }
10782 *puDst = uDst.u;
10783}
10784
10785
10786IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10787{
10788 RTUINT64U uSrc1 = { *puDst };
10789 RTUINT64U uDst;
10790
10791 if (uShift <= 31)
10792 {
10793 uDst.au32[0] = uSrc1.au32[0] << uShift;
10794 uDst.au32[1] = uSrc1.au32[1] << uShift;
10795 }
10796 else
10797 {
10798 uDst.au64[0] = 0;
10799 }
10800 *puDst = uDst.u;
10801}
10802
10803
10804IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10805{
10806 RTUINT128U uSrc1 = *puDst;
10807
10808 if (puSrc->au64[0] <= 31)
10809 {
10810 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10811 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10812 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10813 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10814 }
10815 else
10816 {
10817 puDst->au64[0] = 0;
10818 puDst->au64[1] = 0;
10819 }
10820}
10821
10822IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10823{
10824 RTUINT128U uSrc1 = *puDst;
10825
10826 if (uShift <= 31)
10827 {
10828 puDst->au32[0] = uSrc1.au32[0] << uShift;
10829 puDst->au32[1] = uSrc1.au32[1] << uShift;
10830 puDst->au32[2] = uSrc1.au32[2] << uShift;
10831 puDst->au32[3] = uSrc1.au32[3] << uShift;
10832 }
10833 else
10834 {
10835 puDst->au64[0] = 0;
10836 puDst->au64[1] = 0;
10837 }
10838}
10839
10840#endif
10841
10842IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10843{
10844 RTUINT128U uSrc1 = *puSrc1;
10845
10846 if (uShift <= 31)
10847 {
10848 puDst->au32[0] = uSrc1.au32[0] << uShift;
10849 puDst->au32[1] = uSrc1.au32[1] << uShift;
10850 puDst->au32[2] = uSrc1.au32[2] << uShift;
10851 puDst->au32[3] = uSrc1.au32[3] << uShift;
10852 }
10853 else
10854 {
10855 puDst->au64[0] = 0;
10856 puDst->au64[1] = 0;
10857 }
10858}
10859
10860IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10861{
10862 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, uShift);
10863}
10864
10865IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10866{
10867 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, puSrc2->au8[0]);
10868}
10869
10870IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10871{
10872 RTUINT256U uSrc1 = *puSrc1;
10873
10874 if (uShift <= 31)
10875 {
10876 puDst->au32[0] = uSrc1.au32[0] << uShift;
10877 puDst->au32[1] = uSrc1.au32[1] << uShift;
10878 puDst->au32[2] = uSrc1.au32[2] << uShift;
10879 puDst->au32[3] = uSrc1.au32[3] << uShift;
10880 puDst->au32[4] = uSrc1.au32[4] << uShift;
10881 puDst->au32[5] = uSrc1.au32[5] << uShift;
10882 puDst->au32[6] = uSrc1.au32[6] << uShift;
10883 puDst->au32[7] = uSrc1.au32[7] << uShift;
10884 }
10885 else
10886 {
10887 puDst->au64[0] = 0;
10888 puDst->au64[1] = 0;
10889 puDst->au64[2] = 0;
10890 puDst->au64[3] = 0;
10891 }
10892}
10893
10894IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10895{
10896 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, puSrc2->au8[0]);
10897}
10898
10899IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10900{
10901 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, uShift);
10902}
10903
10904
10905/*
10906 * PSRLQ / VPSRLQ
10907 */
10908#ifdef IEM_WITHOUT_ASSEMBLY
10909
10910IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10911{
10912 RTUINT64U uSrc1 = { *puDst };
10913 RTUINT64U uSrc2 = { *puSrc };
10914 RTUINT64U uDst;
10915
10916 if (uSrc2.au64[0] <= 63)
10917 {
10918 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10919 }
10920 else
10921 {
10922 uDst.au64[0] = 0;
10923 }
10924 *puDst = uDst.u;
10925}
10926
10927
10928IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10929{
10930 RTUINT64U uSrc1 = { *puDst };
10931 RTUINT64U uDst;
10932
10933 if (uShift <= 63)
10934 {
10935 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10936 }
10937 else
10938 {
10939 uDst.au64[0] = 0;
10940 }
10941 *puDst = uDst.u;
10942}
10943
10944
10945IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10946{
10947 RTUINT128U uSrc1 = *puDst;
10948
10949 if (puSrc->au64[0] <= 63)
10950 {
10951 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10952 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10953 }
10954 else
10955 {
10956 puDst->au64[0] = 0;
10957 puDst->au64[1] = 0;
10958 }
10959}
10960
10961IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10962{
10963 RTUINT128U uSrc1 = *puDst;
10964
10965 if (uShift <= 63)
10966 {
10967 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10968 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10969 }
10970 else
10971 {
10972 puDst->au64[0] = 0;
10973 puDst->au64[1] = 0;
10974 }
10975}
10976
10977#endif
10978
10979IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10980{
10981 RTUINT128U uSrc1 = *puSrc1;
10982
10983 if (uShift <= 63)
10984 {
10985 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10986 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10987 }
10988 else
10989 {
10990 puDst->au64[0] = 0;
10991 puDst->au64[1] = 0;
10992 }
10993}
10994
10995IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10996{
10997 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, uShift);
10998}
10999
11000IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11001{
11002 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
11003}
11004
11005IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11006{
11007 RTUINT256U uSrc1 = *puSrc1;
11008
11009 if (uShift <= 63)
11010 {
11011 puDst->au64[0] = uSrc1.au64[0] >> uShift;
11012 puDst->au64[1] = uSrc1.au64[1] >> uShift;
11013 puDst->au64[2] = uSrc1.au64[2] >> uShift;
11014 puDst->au64[3] = uSrc1.au64[3] >> uShift;
11015 }
11016 else
11017 {
11018 puDst->au64[0] = 0;
11019 puDst->au64[1] = 0;
11020 puDst->au64[2] = 0;
11021 puDst->au64[3] = 0;
11022 }
11023}
11024
11025IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11026{
11027 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
11028}
11029
11030IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11031{
11032 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, uShift);
11033}
11034
11035
11036/*
11037 * PSLLQ / VPSLLQ
11038 */
11039#ifdef IEM_WITHOUT_ASSEMBLY
11040
11041IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11042{
11043 RTUINT64U uSrc1 = { *puDst };
11044 RTUINT64U uSrc2 = { *puSrc };
11045 RTUINT64U uDst;
11046
11047 if (uSrc2.au64[0] <= 63)
11048 {
11049 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
11050 }
11051 else
11052 {
11053 uDst.au64[0] = 0;
11054 }
11055 *puDst = uDst.u;
11056}
11057
11058
11059IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
11060{
11061 RTUINT64U uSrc1 = { *puDst };
11062 RTUINT64U uDst;
11063
11064 if (uShift <= 63)
11065 {
11066 uDst.au64[0] = uSrc1.au64[0] << uShift;
11067 }
11068 else
11069 {
11070 uDst.au64[0] = 0;
11071 }
11072 *puDst = uDst.u;
11073}
11074
11075
11076IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11077{
11078 RTUINT128U uSrc1 = *puDst;
11079
11080 if (puSrc->au64[0] <= 63)
11081 {
11082 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
11083 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
11084 }
11085 else
11086 {
11087 puDst->au64[0] = 0;
11088 puDst->au64[1] = 0;
11089 }
11090}
11091
11092IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11093{
11094 RTUINT128U uSrc1 = *puDst;
11095
11096 if (uShift <= 63)
11097 {
11098 puDst->au64[0] = uSrc1.au64[0] << uShift;
11099 puDst->au64[1] = uSrc1.au64[1] << uShift;
11100 }
11101 else
11102 {
11103 puDst->au64[0] = 0;
11104 puDst->au64[1] = 0;
11105 }
11106}
11107
11108#endif
11109
11110IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
11111{
11112 RTUINT128U uSrc1 = *puSrc1;
11113
11114 if (uShift <= 63)
11115 {
11116 puDst->au64[0] = uSrc1.au64[0] << uShift;
11117 puDst->au64[1] = uSrc1.au64[1] << uShift;
11118 }
11119 else
11120 {
11121 puDst->au64[0] = 0;
11122 puDst->au64[1] = 0;
11123 }
11124}
11125
11126IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11127{
11128 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, puSrc2->au8[0]);
11129}
11130
11131IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
11132{
11133 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, uShift);
11134}
11135
11136IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11137{
11138 RTUINT256U uSrc1 = *puSrc1;
11139
11140 if (uShift <= 63)
11141 {
11142 puDst->au64[0] = uSrc1.au64[0] << uShift;
11143 puDst->au64[1] = uSrc1.au64[1] << uShift;
11144 puDst->au64[2] = uSrc1.au64[2] << uShift;
11145 puDst->au64[3] = uSrc1.au64[3] << uShift;
11146 }
11147 else
11148 {
11149 puDst->au64[0] = 0;
11150 puDst->au64[1] = 0;
11151 puDst->au64[2] = 0;
11152 puDst->au64[3] = 0;
11153 }
11154}
11155
11156IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11157{
11158 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, puSrc2->au8[0]);
11159}
11160
11161IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11162{
11163 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, uShift);
11164}
11165
11166
11167/*
11168 * PSRLDQ / VPSRLDQ
11169 */
11170#ifdef IEM_WITHOUT_ASSEMBLY
11171
11172IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11173{
11174 RTUINT128U uSrc1 = *puDst;
11175
11176 if (uShift < 16)
11177 {
11178 int i;
11179
11180 for (i = 0; i < 16 - uShift; ++i)
11181 puDst->au8[i] = uSrc1.au8[i + uShift];
11182 for (i = 16 - uShift; i < 16; ++i)
11183 puDst->au8[i] = 0;
11184 }
11185 else
11186 {
11187 puDst->au64[0] = 0;
11188 puDst->au64[1] = 0;
11189 }
11190}
11191
11192#endif
11193
11194
11195/*
11196 * PSLLDQ / VPSLLDQ
11197 */
11198#ifdef IEM_WITHOUT_ASSEMBLY
11199
11200IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11201{
11202 RTUINT128U uSrc1 = *puDst;
11203
11204 if (uShift < 16)
11205 {
11206 int i;
11207
11208 for (i = 0; i < uShift; ++i)
11209 puDst->au8[i] = 0;
11210 for (i = uShift; i < 16; ++i)
11211 puDst->au8[i] = uSrc1.au8[i - uShift];
11212 }
11213 else
11214 {
11215 puDst->au64[0] = 0;
11216 puDst->au64[1] = 0;
11217 }
11218}
11219
11220#endif
11221
11222
11223/*
11224 * PMADDWD / VPMADDWD
11225 */
11226#ifdef IEM_WITHOUT_ASSEMBLY
11227
11228IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11229{
11230 RTUINT64U uSrc1 = { *puDst };
11231 RTUINT64U uSrc2 = { *puSrc };
11232 RTUINT64U uDst;
11233
11234 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11235 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11236 *puDst = uDst.u;
11237 RT_NOREF(pFpuState);
11238}
11239
11240
11241IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11242{
11243 RTUINT128U uSrc1 = *puDst;
11244
11245 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11246 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11247 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11248 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11249 RT_NOREF(pFpuState);
11250}
11251
11252#endif
11253
11254
11255/*
11256 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
11257 */
11258#ifdef IEM_WITHOUT_ASSEMBLY
11259
11260IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11261{
11262 RTUINT64U uSrc1 = { *puDst };
11263 RTUINT64U uSrc2 = { *puSrc };
11264 RTUINT64U uDst;
11265
11266 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
11267 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
11268 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
11269 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
11270 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
11271 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
11272 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
11273 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
11274 *puDst = uDst.u;
11275 RT_NOREF(pFpuState);
11276}
11277
11278
11279IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11280{
11281 RTUINT128U uSrc1 = *puDst;
11282
11283 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
11284 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
11285 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
11286 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
11287 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
11288 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
11289 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
11290 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
11291 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
11292 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
11293 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
11294 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
11295 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
11296 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
11297 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
11298 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
11299 RT_NOREF(pFpuState);
11300}
11301
11302#endif
11303
11304
11305IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11306{
11307 RTUINT128U uSrc1 = *puDst;
11308
11309 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
11310 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
11311 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
11312 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
11313 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
11314 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
11315 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
11316 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
11317 RT_NOREF(pFpuState);
11318}
11319
11320
11321IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11322{
11323 RTUINT128U uSrc1 = *puDst;
11324
11325 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
11326 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
11327 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
11328 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
11329 RT_NOREF(pFpuState);
11330}
11331
11332
11333IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11334 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11335{
11336 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11337 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11338 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11339 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11340 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11341 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11342 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11343 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11344 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11345 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11346 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11347 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11348 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11349 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11350 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11351 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11352 RT_NOREF(pExtState);
11353}
11354
11355
11356IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11357 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11358{
11359 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11360 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11361 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11362 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11363 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11364 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11365 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11366 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11367 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11368 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11369 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11370 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11371 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11372 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11373 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11374 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11375 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
11376 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
11377 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
11378 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
11379 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
11380 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
11381 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
11382 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
11383 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
11384 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
11385 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
11386 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
11387 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
11388 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
11389 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
11390 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
11391 RT_NOREF(pExtState);
11392}
11393
11394
11395IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11396 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11397{
11398 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11399 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11400 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11401 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11402 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11403 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11404 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11405 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11406 RT_NOREF(pExtState);
11407}
11408
11409
11410IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11411 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11412{
11413 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11414 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11415 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11416 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11417 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11418 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11419 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11420 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11421 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11422 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11423 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
11424 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
11425 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
11426 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
11427 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
11428 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
11429 RT_NOREF(pExtState);
11430}
11431
11432
11433IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11434 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11435{
11436 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11437 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11438 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11439 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11440 RT_NOREF(pExtState);
11441}
11442
11443
11444IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11445 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11446{
11447 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11448 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11449 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11450 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11451 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11452 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11453 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11454 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11455 RT_NOREF(pExtState);
11456}
11457
11458
11459/*
11460 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
11461 */
11462#ifdef IEM_WITHOUT_ASSEMBLY
11463
11464IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11465{
11466 RTUINT64U uSrc1 = { *puDst };
11467 RTUINT64U uSrc2 = { *puSrc };
11468 RTUINT64U uDst;
11469
11470 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
11471 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
11472 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
11473 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
11474 *puDst = uDst.u;
11475 RT_NOREF(pFpuState);
11476}
11477
11478
11479IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11480{
11481 RTUINT128U uSrc1 = *puDst;
11482
11483 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11484 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11485 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11486 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11487 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11488 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11489 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11490 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11491 RT_NOREF(pFpuState);
11492}
11493
11494#endif
11495
11496IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11497{
11498 RTUINT128U uSrc1 = *puDst;
11499
11500 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11501 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11502 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11503 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11504 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11505 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11506 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11507 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11508 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11509 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11510 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
11511 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
11512 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
11513 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
11514 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
11515 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
11516 RT_NOREF(pFpuState);
11517}
11518
11519
11520IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11521{
11522 RTUINT128U uSrc1 = *puDst;
11523
11524 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11525 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11526 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11527 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11528 RT_NOREF(pFpuState);
11529}
11530
11531
11532IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11533 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11534{
11535 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11536 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11537 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11538 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11539 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11540 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11541 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11542 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11543 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11544 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11545 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11546 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11547 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11548 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11549 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11550 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11551 RT_NOREF(pExtState);
11552}
11553
11554
11555IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11556 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11557{
11558 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11559 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11560 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11561 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11562 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11563 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11564 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11565 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11566 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11567 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11568 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11569 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11570 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11571 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11572 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11573 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11574 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
11575 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
11576 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
11577 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
11578 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
11579 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
11580 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
11581 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
11582 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
11583 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
11584 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
11585 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
11586 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
11587 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
11588 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
11589 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
11590 RT_NOREF(pExtState);
11591}
11592
11593
11594IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11595 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11596{
11597 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11598 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11599 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11600 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11601 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11602 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11603 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11604 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11605 RT_NOREF(pExtState);
11606}
11607
11608
11609IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11610 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11611{
11612 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11613 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11614 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11615 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11616 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11617 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11618 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11619 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11620 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11621 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11622 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
11623 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
11624 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
11625 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
11626 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
11627 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
11628 RT_NOREF(pExtState);
11629}
11630
11631
11632IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11633 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11634{
11635 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11636 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11637 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11638 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11639 RT_NOREF(pExtState);
11640}
11641
11642
11643IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11644 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11645{
11646 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11647 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11648 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11649 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11650 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11651 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11652 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11653 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11654 RT_NOREF(pExtState);
11655}
11656
11657
11658/*
11659 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
11660 */
11661#ifdef IEM_WITHOUT_ASSEMBLY
11662
11663IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11664{
11665 RTUINT64U uSrc1 = { *puDst };
11666 RTUINT64U uSrc2 = { *puSrc };
11667 RTUINT64U uDst;
11668
11669 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
11670 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
11671 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
11672 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
11673 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
11674 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
11675 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
11676 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
11677 *puDst = uDst.u;
11678 RT_NOREF(pFpuState);
11679}
11680
11681
11682IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11683{
11684 RTUINT128U uSrc1 = *puDst;
11685
11686 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
11687 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
11688 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
11689 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
11690 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
11691 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
11692 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
11693 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
11694 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
11695 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
11696 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
11697 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
11698 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
11699 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
11700 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
11701 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
11702 RT_NOREF(pFpuState);
11703}
11704
11705#endif
11706
11707IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11708{
11709 RTUINT128U uSrc1 = *puDst;
11710
11711 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
11712 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
11713 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
11714 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
11715 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
11716 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
11717 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
11718 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
11719 RT_NOREF(pFpuState);
11720}
11721
11722
11723IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11724{
11725 RTUINT128U uSrc1 = *puDst;
11726
11727 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
11728 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
11729 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
11730 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
11731 RT_NOREF(pFpuState);
11732}
11733
11734
11735IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11736 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11737{
11738 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11739 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11740 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11741 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11742 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11743 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11744 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11745 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11746 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11747 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11748 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11749 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11750 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11751 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11752 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11753 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11754 RT_NOREF(pExtState);
11755}
11756
11757
11758IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11759 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11760{
11761 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11762 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11763 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11764 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11765 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11766 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11767 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11768 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11769 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11770 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11771 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11772 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11773 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11774 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11775 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11776 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11777 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
11778 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
11779 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
11780 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
11781 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
11782 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
11783 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
11784 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
11785 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
11786 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
11787 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
11788 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
11789 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
11790 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
11791 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
11792 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
11793 RT_NOREF(pExtState);
11794}
11795
11796
11797IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11798 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11799{
11800 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11801 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11802 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11803 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11804 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11805 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11806 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11807 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11808 RT_NOREF(pExtState);
11809}
11810
11811
11812IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11813 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11814{
11815 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11816 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11817 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11818 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11819 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11820 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11821 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11822 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11823 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11824 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11825 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
11826 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
11827 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
11828 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
11829 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
11830 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
11831 RT_NOREF(pExtState);
11832}
11833
11834
11835IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11836 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11837{
11838 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11839 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11840 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11841 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11842 RT_NOREF(pExtState);
11843}
11844
11845
11846IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11847 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11848{
11849 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11850 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11851 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11852 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11853 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11854 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11855 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11856 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11857 RT_NOREF(pExtState);
11858}
11859
11860
11861/*
11862 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
11863 */
11864#ifdef IEM_WITHOUT_ASSEMBLY
11865
11866IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11867{
11868 RTUINT64U uSrc1 = { *puDst };
11869 RTUINT64U uSrc2 = { *puSrc };
11870 RTUINT64U uDst;
11871
11872 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
11873 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
11874 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
11875 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
11876 *puDst = uDst.u;
11877 RT_NOREF(pFpuState);
11878}
11879
11880
11881IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11882{
11883 RTUINT128U uSrc1 = *puDst;
11884
11885 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11886 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11887 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11888 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11889 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11890 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11891 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11892 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11893 RT_NOREF(pFpuState);
11894}
11895
11896#endif
11897
11898IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11899{
11900 RTUINT128U uSrc1 = *puDst;
11901
11902 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11903 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11904 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11905 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11906 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11907 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11908 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11909 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11910 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11911 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11912 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
11913 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
11914 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
11915 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
11916 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
11917 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
11918 RT_NOREF(pFpuState);
11919}
11920
11921
11922IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11923{
11924 RTUINT128U uSrc1 = *puDst;
11925
11926 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11927 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11928 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11929 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11930 RT_NOREF(pFpuState);
11931}
11932
11933
11934IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11935 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11936{
11937 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11938 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11939 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11940 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11941 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11942 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11943 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11944 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11945 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11946 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11947 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11948 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11949 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11950 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11951 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11952 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11953 RT_NOREF(pExtState);
11954}
11955
11956
11957IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11958 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11959{
11960 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11961 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11962 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11963 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11964 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11965 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11966 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11967 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11968 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11969 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11970 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11971 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11972 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11973 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11974 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11975 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11976 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
11977 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
11978 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
11979 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
11980 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
11981 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
11982 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
11983 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
11984 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
11985 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
11986 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
11987 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
11988 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
11989 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
11990 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
11991 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
11992 RT_NOREF(pExtState);
11993}
11994
11995
11996IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11997 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11998{
11999 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12000 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12001 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12002 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12003 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12004 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12005 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12006 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12007 RT_NOREF(pExtState);
12008}
12009
12010
12011IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12012 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12013{
12014 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12015 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12016 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12017 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12018 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12019 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12020 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12021 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12022 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
12023 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
12024 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
12025 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
12026 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
12027 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
12028 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
12029 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
12030 RT_NOREF(pExtState);
12031}
12032
12033
12034IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
12035 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12036{
12037 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12038 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12039 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12040 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12041 RT_NOREF(pExtState);
12042}
12043
12044
12045IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12046 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12047{
12048 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12049 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12050 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12051 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12052 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
12053 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
12054 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
12055 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
12056 RT_NOREF(pExtState);
12057}
12058
12059
12060/*
12061 * PAVGB / VPAVGB / PAVGW / VPAVGW
12062 */
/** Rounded byte average: widens the first operand to 16 bits, adds the second
 *  operand plus one, halves the sum and truncates the result to uint8_t. */
#define PAVGB_EXEC(a_Src1, a_Src2) ((uint8_t)(((uint16_t)(a_Src1) + (a_Src2) + 1) >> 1))
/** Rounded word average: widens the first operand to 32 bits, adds the second
 *  operand plus one, halves the sum and truncates the result to uint16_t. */
#define PAVGW_EXEC(a_Src1, a_Src2) ((uint16_t)(((uint32_t)(a_Src1) + (a_Src2) + 1) >> 1))
12065
12066#ifdef IEM_WITHOUT_ASSEMBLY
12067
12068IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12069{
12070 RTUINT64U uSrc1 = { *puDst };
12071 RTUINT64U uSrc2 = { *puSrc };
12072 RTUINT64U uDst;
12073
12074 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
12075 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
12076 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
12077 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
12078 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
12079 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
12080 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
12081 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
12082 *puDst = uDst.u;
12083}
12084
12085
12086IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12087{
12088 RTUINT128U uSrc1 = *puDst;
12089
12090 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12091 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12092 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12093 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12094 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12095 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12096 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12097 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12098 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12099 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12100 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12101 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12102 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12103 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12104 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12105 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12106}
12107
12108
12109IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12110{
12111 RTUINT64U uSrc1 = { *puDst };
12112 RTUINT64U uSrc2 = { *puSrc };
12113 RTUINT64U uDst;
12114
12115 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
12116 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
12117 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
12118 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
12119 *puDst = uDst.u;
12120}
12121
12122
12123IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12124{
12125 RTUINT128U uSrc1 = *puDst;
12126
12127 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
12128 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
12129 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
12130 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
12131 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
12132 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
12133 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
12134 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
12135}
12136
12137#endif
12138
12139IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12140{
12141 RTUINT128U uSrc1 = *puDst;
12142
12143 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12144 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12145 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12146 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12147 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12148 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12149 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12150 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12151 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12152 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12153 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12154 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12155 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12156 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12157 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12158 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12159}
12160
12161
12162IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12163{
12164 RTUINT128U uSrc1 = *puDst;
12165
12166 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12167 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12168 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12169 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12170 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12171 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12172 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12173 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12174 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12175 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12176 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12177 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12178 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12179 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12180 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12181 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12182}
12183
12184
12185IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12186{
12187 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12188 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12189 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12190 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12191 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12192 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12193 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12194 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12195 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12196 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12197 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12198 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12199 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12200 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12201 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12202 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12203}
12204
12205
12206IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12207{
12208 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12209 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12210 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12211 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12212 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12213 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12214 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12215 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12216 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12217 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12218 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12219 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12220 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12221 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12222 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12223 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12224 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
12225 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
12226 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
12227 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
12228 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
12229 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
12230 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
12231 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
12232 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
12233 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
12234 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
12235 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
12236 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
12237 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
12238 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
12239 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
12240}
12241
12242
12243IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12244{
12245 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12246 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12247 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12248 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12249 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12250 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12251 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12252 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12253}
12254
12255
12256IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12257{
12258 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12259 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12260 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12261 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12262 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12263 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12264 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12265 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12266 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
12267 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
12268 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
12269 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
12270 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
12271 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
12272 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
12273 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
12274}
12275
12276#undef PAVGB_EXEC
12277#undef PAVGW_EXEC
12278
12279
12280/*
12281 * PMOVMSKB / VPMOVMSKB
12282 */
12283#ifdef IEM_WITHOUT_ASSEMBLY
12284
12285IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
12286{
12287 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12288 uint64_t const uSrc = *pu64Src;
12289 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
12290 | ((uSrc >> (15-1)) & RT_BIT_64(1))
12291 | ((uSrc >> (23-2)) & RT_BIT_64(2))
12292 | ((uSrc >> (31-3)) & RT_BIT_64(3))
12293 | ((uSrc >> (39-4)) & RT_BIT_64(4))
12294 | ((uSrc >> (47-5)) & RT_BIT_64(5))
12295 | ((uSrc >> (55-6)) & RT_BIT_64(6))
12296 | ((uSrc >> (63-7)) & RT_BIT_64(7));
12297}
12298
12299
12300IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
12301{
12302 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12303 uint64_t const uSrc0 = pu128Src->QWords.qw0;
12304 uint64_t const uSrc1 = pu128Src->QWords.qw1;
12305 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12306 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12307 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12308 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12309 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12310 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12311 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12312 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12313 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12314 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12315 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12316 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12317 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12318 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12319 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12320 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
12321}
12322
12323#endif
12324
12325IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
12326{
12327 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12328 uint64_t const uSrc0 = puSrc->QWords.qw0;
12329 uint64_t const uSrc1 = puSrc->QWords.qw1;
12330 uint64_t const uSrc2 = puSrc->QWords.qw2;
12331 uint64_t const uSrc3 = puSrc->QWords.qw3;
12332 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12333 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12334 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12335 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12336 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12337 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12338 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12339 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12340 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12341 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12342 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12343 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12344 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12345 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12346 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12347 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
12348 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
12349 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
12350 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
12351 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
12352 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
12353 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
12354 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
12355 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
12356 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
12357 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
12358 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
12359 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
12360 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
12361 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
12362 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
12363 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
12364}
12365
12366
12367/*
12368 * [V]PSHUFB
12369 */
12370
12371IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12372{
12373 RTUINT64U const uSrc = { *puSrc };
12374 RTUINT64U const uDstIn = { *puDst };
12375 ASMCompilerBarrier();
12376 RTUINT64U uDstOut = { 0 };
12377 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
12378 {
12379 uint8_t idxSrc = uSrc.au8[iByte];
12380 if (!(idxSrc & 0x80))
12381 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
12382 }
12383 *puDst = uDstOut.u;
12384 RT_NOREF(pFpuState);
12385}
12386
12387
12388IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12389{
12390 RTUINT128U const uSrc = *puSrc;
12391 RTUINT128U const uDstIn = *puDst;
12392 ASMCompilerBarrier();
12393 puDst->au64[0] = 0;
12394 puDst->au64[1] = 0;
12395 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12396 {
12397 uint8_t idxSrc = uSrc.au8[iByte];
12398 if (!(idxSrc & 0x80))
12399 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
12400 }
12401 RT_NOREF(pFpuState);
12402}
12403
12404
12405IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
12406 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12407{
12408 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
12409 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
12410 ASMCompilerBarrier();
12411 puDst->au64[0] = 0;
12412 puDst->au64[1] = 0;
12413 for (unsigned iByte = 0; iByte < 16; iByte++)
12414 {
12415 uint8_t idxSrc = uSrc2.au8[iByte];
12416 if (!(idxSrc & 0x80))
12417 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12418 }
12419 RT_NOREF(pExtState);
12420}
12421
12422
12423IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12424 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12425{
12426 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
12427 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
12428 ASMCompilerBarrier();
12429 puDst->au64[0] = 0;
12430 puDst->au64[1] = 0;
12431 puDst->au64[2] = 0;
12432 puDst->au64[3] = 0;
12433 for (unsigned iByte = 0; iByte < 16; iByte++)
12434 {
12435 uint8_t idxSrc = uSrc2.au8[iByte];
12436 if (!(idxSrc & 0x80))
12437 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12438 }
12439 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12440 {
12441 uint8_t idxSrc = uSrc2.au8[iByte];
12442 if (!(idxSrc & 0x80))
12443 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
12444 }
12445 RT_NOREF(pExtState);
12446}
12447
12448
12449/*
12450 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
12451 */
12452#ifdef IEM_WITHOUT_ASSEMBLY
12453
12454IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
12455{
12456 uint64_t const uSrc = *puSrc;
12457 ASMCompilerBarrier();
12458 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12459 uSrc >> (((bEvil >> 2) & 3) * 16),
12460 uSrc >> (((bEvil >> 4) & 3) * 16),
12461 uSrc >> (((bEvil >> 6) & 3) * 16));
12462}
12463
12464
12465IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12466{
12467 puDst->QWords.qw0 = puSrc->QWords.qw0;
12468 uint64_t const uSrc = puSrc->QWords.qw1;
12469 ASMCompilerBarrier();
12470 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12471 uSrc >> (((bEvil >> 2) & 3) * 16),
12472 uSrc >> (((bEvil >> 4) & 3) * 16),
12473 uSrc >> (((bEvil >> 6) & 3) * 16));
12474}
12475
12476#endif
12477
12478IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12479{
12480 puDst->QWords.qw0 = puSrc->QWords.qw0;
12481 uint64_t const uSrc1 = puSrc->QWords.qw1;
12482 puDst->QWords.qw2 = puSrc->QWords.qw2;
12483 uint64_t const uSrc3 = puSrc->QWords.qw3;
12484 ASMCompilerBarrier();
12485 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
12486 uSrc1 >> (((bEvil >> 2) & 3) * 16),
12487 uSrc1 >> (((bEvil >> 4) & 3) * 16),
12488 uSrc1 >> (((bEvil >> 6) & 3) * 16));
12489 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
12490 uSrc3 >> (((bEvil >> 2) & 3) * 16),
12491 uSrc3 >> (((bEvil >> 4) & 3) * 16),
12492 uSrc3 >> (((bEvil >> 6) & 3) * 16));
12493}
12494
12495#ifdef IEM_WITHOUT_ASSEMBLY
12496IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12497{
12498 puDst->QWords.qw1 = puSrc->QWords.qw1;
12499 uint64_t const uSrc = puSrc->QWords.qw0;
12500 ASMCompilerBarrier();
12501 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12502 uSrc >> (((bEvil >> 2) & 3) * 16),
12503 uSrc >> (((bEvil >> 4) & 3) * 16),
12504 uSrc >> (((bEvil >> 6) & 3) * 16));
12505
12506}
12507#endif
12508
12509
12510IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12511{
12512 puDst->QWords.qw3 = puSrc->QWords.qw3;
12513 uint64_t const uSrc2 = puSrc->QWords.qw2;
12514 puDst->QWords.qw1 = puSrc->QWords.qw1;
12515 uint64_t const uSrc0 = puSrc->QWords.qw0;
12516 ASMCompilerBarrier();
12517 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
12518 uSrc0 >> (((bEvil >> 2) & 3) * 16),
12519 uSrc0 >> (((bEvil >> 4) & 3) * 16),
12520 uSrc0 >> (((bEvil >> 6) & 3) * 16));
12521 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
12522 uSrc2 >> (((bEvil >> 2) & 3) * 16),
12523 uSrc2 >> (((bEvil >> 4) & 3) * 16),
12524 uSrc2 >> (((bEvil >> 6) & 3) * 16));
12525
12526}
12527
12528
12529#ifdef IEM_WITHOUT_ASSEMBLY
12530IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12531{
12532 RTUINT128U const uSrc = *puSrc;
12533 ASMCompilerBarrier();
12534 puDst->au32[0] = uSrc.au32[bEvil & 3];
12535 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
12536 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
12537 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
12538}
12539#endif
12540
12541
12542IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12543{
12544 RTUINT256U const uSrc = *puSrc;
12545 ASMCompilerBarrier();
12546 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
12547 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
12548 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
12549 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
12550 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
12551 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
12552 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
12553 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
12554}
12555
12556
12557/*
12558 * PUNPCKHBW - high bytes -> words
12559 */
12560#ifdef IEM_WITHOUT_ASSEMBLY
12561
12562IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12563{
12564 RTUINT64U const uSrc2 = { *puSrc };
12565 RTUINT64U const uSrc1 = { *puDst };
12566 ASMCompilerBarrier();
12567 RTUINT64U uDstOut;
12568 uDstOut.au8[0] = uSrc1.au8[4];
12569 uDstOut.au8[1] = uSrc2.au8[4];
12570 uDstOut.au8[2] = uSrc1.au8[5];
12571 uDstOut.au8[3] = uSrc2.au8[5];
12572 uDstOut.au8[4] = uSrc1.au8[6];
12573 uDstOut.au8[5] = uSrc2.au8[6];
12574 uDstOut.au8[6] = uSrc1.au8[7];
12575 uDstOut.au8[7] = uSrc2.au8[7];
12576 *puDst = uDstOut.u;
12577}
12578
12579
12580IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12581{
12582 RTUINT128U const uSrc2 = *puSrc;
12583 RTUINT128U const uSrc1 = *puDst;
12584 ASMCompilerBarrier();
12585 RTUINT128U uDstOut;
12586 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12587 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12588 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12589 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12590 uDstOut.au8[ 4] = uSrc1.au8[10];
12591 uDstOut.au8[ 5] = uSrc2.au8[10];
12592 uDstOut.au8[ 6] = uSrc1.au8[11];
12593 uDstOut.au8[ 7] = uSrc2.au8[11];
12594 uDstOut.au8[ 8] = uSrc1.au8[12];
12595 uDstOut.au8[ 9] = uSrc2.au8[12];
12596 uDstOut.au8[10] = uSrc1.au8[13];
12597 uDstOut.au8[11] = uSrc2.au8[13];
12598 uDstOut.au8[12] = uSrc1.au8[14];
12599 uDstOut.au8[13] = uSrc2.au8[14];
12600 uDstOut.au8[14] = uSrc1.au8[15];
12601 uDstOut.au8[15] = uSrc2.au8[15];
12602 *puDst = uDstOut;
12603}
12604
12605#endif
12606
12607IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12608{
12609 RTUINT128U const uSrc2 = *puSrc2;
12610 RTUINT128U const uSrc1 = *puSrc1;
12611 ASMCompilerBarrier();
12612 RTUINT128U uDstOut;
12613 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12614 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12615 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12616 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12617 uDstOut.au8[ 4] = uSrc1.au8[10];
12618 uDstOut.au8[ 5] = uSrc2.au8[10];
12619 uDstOut.au8[ 6] = uSrc1.au8[11];
12620 uDstOut.au8[ 7] = uSrc2.au8[11];
12621 uDstOut.au8[ 8] = uSrc1.au8[12];
12622 uDstOut.au8[ 9] = uSrc2.au8[12];
12623 uDstOut.au8[10] = uSrc1.au8[13];
12624 uDstOut.au8[11] = uSrc2.au8[13];
12625 uDstOut.au8[12] = uSrc1.au8[14];
12626 uDstOut.au8[13] = uSrc2.au8[14];
12627 uDstOut.au8[14] = uSrc1.au8[15];
12628 uDstOut.au8[15] = uSrc2.au8[15];
12629 *puDst = uDstOut;
12630}
12631
12632
12633IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12634{
12635 RTUINT256U const uSrc2 = *puSrc2;
12636 RTUINT256U const uSrc1 = *puSrc1;
12637 ASMCompilerBarrier();
12638 RTUINT256U uDstOut;
12639 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12640 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12641 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12642 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12643 uDstOut.au8[ 4] = uSrc1.au8[10];
12644 uDstOut.au8[ 5] = uSrc2.au8[10];
12645 uDstOut.au8[ 6] = uSrc1.au8[11];
12646 uDstOut.au8[ 7] = uSrc2.au8[11];
12647 uDstOut.au8[ 8] = uSrc1.au8[12];
12648 uDstOut.au8[ 9] = uSrc2.au8[12];
12649 uDstOut.au8[10] = uSrc1.au8[13];
12650 uDstOut.au8[11] = uSrc2.au8[13];
12651 uDstOut.au8[12] = uSrc1.au8[14];
12652 uDstOut.au8[13] = uSrc2.au8[14];
12653 uDstOut.au8[14] = uSrc1.au8[15];
12654 uDstOut.au8[15] = uSrc2.au8[15];
12655 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12656 uDstOut.au8[16] = uSrc1.au8[24];
12657 uDstOut.au8[17] = uSrc2.au8[24];
12658 uDstOut.au8[18] = uSrc1.au8[25];
12659 uDstOut.au8[19] = uSrc2.au8[25];
12660 uDstOut.au8[20] = uSrc1.au8[26];
12661 uDstOut.au8[21] = uSrc2.au8[26];
12662 uDstOut.au8[22] = uSrc1.au8[27];
12663 uDstOut.au8[23] = uSrc2.au8[27];
12664 uDstOut.au8[24] = uSrc1.au8[28];
12665 uDstOut.au8[25] = uSrc2.au8[28];
12666 uDstOut.au8[26] = uSrc1.au8[29];
12667 uDstOut.au8[27] = uSrc2.au8[29];
12668 uDstOut.au8[28] = uSrc1.au8[30];
12669 uDstOut.au8[29] = uSrc2.au8[30];
12670 uDstOut.au8[30] = uSrc1.au8[31];
12671 uDstOut.au8[31] = uSrc2.au8[31];
12672 *puDst = uDstOut;
12673}
12674
12675
12676/*
12677 * PUNPCKHWD - high words -> dwords
12678 */
12679#ifdef IEM_WITHOUT_ASSEMBLY
12680
12681IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12682{
12683 RTUINT64U const uSrc2 = { *puSrc };
12684 RTUINT64U const uSrc1 = { *puDst };
12685 ASMCompilerBarrier();
12686 RTUINT64U uDstOut;
12687 uDstOut.au16[0] = uSrc1.au16[2];
12688 uDstOut.au16[1] = uSrc2.au16[2];
12689 uDstOut.au16[2] = uSrc1.au16[3];
12690 uDstOut.au16[3] = uSrc2.au16[3];
12691 *puDst = uDstOut.u;
12692}
12693
12694
12695IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12696{
12697 RTUINT128U const uSrc2 = *puSrc;
12698 RTUINT128U const uSrc1 = *puDst;
12699 ASMCompilerBarrier();
12700 RTUINT128U uDstOut;
12701 uDstOut.au16[0] = uSrc1.au16[4];
12702 uDstOut.au16[1] = uSrc2.au16[4];
12703 uDstOut.au16[2] = uSrc1.au16[5];
12704 uDstOut.au16[3] = uSrc2.au16[5];
12705 uDstOut.au16[4] = uSrc1.au16[6];
12706 uDstOut.au16[5] = uSrc2.au16[6];
12707 uDstOut.au16[6] = uSrc1.au16[7];
12708 uDstOut.au16[7] = uSrc2.au16[7];
12709 *puDst = uDstOut;
12710}
12711
12712#endif
12713
12714IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12715{
12716 RTUINT128U const uSrc2 = *puSrc2;
12717 RTUINT128U const uSrc1 = *puSrc1;
12718 ASMCompilerBarrier();
12719 RTUINT128U uDstOut;
12720 uDstOut.au16[0] = uSrc1.au16[4];
12721 uDstOut.au16[1] = uSrc2.au16[4];
12722 uDstOut.au16[2] = uSrc1.au16[5];
12723 uDstOut.au16[3] = uSrc2.au16[5];
12724 uDstOut.au16[4] = uSrc1.au16[6];
12725 uDstOut.au16[5] = uSrc2.au16[6];
12726 uDstOut.au16[6] = uSrc1.au16[7];
12727 uDstOut.au16[7] = uSrc2.au16[7];
12728 *puDst = uDstOut;
12729}
12730
12731
12732IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12733{
12734 RTUINT256U const uSrc2 = *puSrc2;
12735 RTUINT256U const uSrc1 = *puSrc1;
12736 ASMCompilerBarrier();
12737 RTUINT256U uDstOut;
12738 uDstOut.au16[0] = uSrc1.au16[4];
12739 uDstOut.au16[1] = uSrc2.au16[4];
12740 uDstOut.au16[2] = uSrc1.au16[5];
12741 uDstOut.au16[3] = uSrc2.au16[5];
12742 uDstOut.au16[4] = uSrc1.au16[6];
12743 uDstOut.au16[5] = uSrc2.au16[6];
12744 uDstOut.au16[6] = uSrc1.au16[7];
12745 uDstOut.au16[7] = uSrc2.au16[7];
12746
12747 uDstOut.au16[8] = uSrc1.au16[12];
12748 uDstOut.au16[9] = uSrc2.au16[12];
12749 uDstOut.au16[10] = uSrc1.au16[13];
12750 uDstOut.au16[11] = uSrc2.au16[13];
12751 uDstOut.au16[12] = uSrc1.au16[14];
12752 uDstOut.au16[13] = uSrc2.au16[14];
12753 uDstOut.au16[14] = uSrc1.au16[15];
12754 uDstOut.au16[15] = uSrc2.au16[15];
12755 *puDst = uDstOut;
12756}
12757
12758
12759/*
12760 * PUNPCKHDQ - high dwords -> qword(s)
12761 */
12762#ifdef IEM_WITHOUT_ASSEMBLY
12763
12764IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12765{
12766 RTUINT64U const uSrc2 = { *puSrc };
12767 RTUINT64U const uSrc1 = { *puDst };
12768 ASMCompilerBarrier();
12769 RTUINT64U uDstOut;
12770 uDstOut.au32[0] = uSrc1.au32[1];
12771 uDstOut.au32[1] = uSrc2.au32[1];
12772 *puDst = uDstOut.u;
12773}
12774
12775
12776IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12777{
12778 RTUINT128U const uSrc2 = *puSrc;
12779 RTUINT128U const uSrc1 = *puDst;
12780 ASMCompilerBarrier();
12781 RTUINT128U uDstOut;
12782 uDstOut.au32[0] = uSrc1.au32[2];
12783 uDstOut.au32[1] = uSrc2.au32[2];
12784 uDstOut.au32[2] = uSrc1.au32[3];
12785 uDstOut.au32[3] = uSrc2.au32[3];
12786 *puDst = uDstOut;
12787}
12788
12789#endif
12790
12791IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12792{
12793 RTUINT128U const uSrc2 = *puSrc2;
12794 RTUINT128U const uSrc1 = *puSrc1;
12795 ASMCompilerBarrier();
12796 RTUINT128U uDstOut;
12797 uDstOut.au32[0] = uSrc1.au32[2];
12798 uDstOut.au32[1] = uSrc2.au32[2];
12799 uDstOut.au32[2] = uSrc1.au32[3];
12800 uDstOut.au32[3] = uSrc2.au32[3];
12801 *puDst = uDstOut;
12802}
12803
12804
12805IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12806{
12807 RTUINT256U const uSrc2 = *puSrc2;
12808 RTUINT256U const uSrc1 = *puSrc1;
12809 ASMCompilerBarrier();
12810 RTUINT256U uDstOut;
12811 uDstOut.au32[0] = uSrc1.au32[2];
12812 uDstOut.au32[1] = uSrc2.au32[2];
12813 uDstOut.au32[2] = uSrc1.au32[3];
12814 uDstOut.au32[3] = uSrc2.au32[3];
12815
12816 uDstOut.au32[4] = uSrc1.au32[6];
12817 uDstOut.au32[5] = uSrc2.au32[6];
12818 uDstOut.au32[6] = uSrc1.au32[7];
12819 uDstOut.au32[7] = uSrc2.au32[7];
12820 *puDst = uDstOut;
12821}
12822
12823
12824/*
12825 * PUNPCKHQDQ -> High qwords -> double qword(s).
12826 */
12827#ifdef IEM_WITHOUT_ASSEMBLY
12828IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12829{
12830 RTUINT128U const uSrc2 = *puSrc;
12831 RTUINT128U const uSrc1 = *puDst;
12832 ASMCompilerBarrier();
12833 RTUINT128U uDstOut;
12834 uDstOut.au64[0] = uSrc1.au64[1];
12835 uDstOut.au64[1] = uSrc2.au64[1];
12836 *puDst = uDstOut;
12837}
12838#endif
12839
12840
12841IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12842{
12843 RTUINT128U const uSrc2 = *puSrc2;
12844 RTUINT128U const uSrc1 = *puSrc1;
12845 ASMCompilerBarrier();
12846 RTUINT128U uDstOut;
12847 uDstOut.au64[0] = uSrc1.au64[1];
12848 uDstOut.au64[1] = uSrc2.au64[1];
12849 *puDst = uDstOut;
12850}
12851
12852
12853IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12854{
12855 RTUINT256U const uSrc2 = *puSrc2;
12856 RTUINT256U const uSrc1 = *puSrc1;
12857 ASMCompilerBarrier();
12858 RTUINT256U uDstOut;
12859 uDstOut.au64[0] = uSrc1.au64[1];
12860 uDstOut.au64[1] = uSrc2.au64[1];
12861
12862 uDstOut.au64[2] = uSrc1.au64[3];
12863 uDstOut.au64[3] = uSrc2.au64[3];
12864 *puDst = uDstOut;
12865}
12866
12867
12868/*
12869 * PUNPCKLBW - low bytes -> words
12870 */
12871#ifdef IEM_WITHOUT_ASSEMBLY
12872
12873IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12874{
12875 RTUINT64U const uSrc2 = { *puSrc };
12876 RTUINT64U const uSrc1 = { *puDst };
12877 ASMCompilerBarrier();
12878 RTUINT64U uDstOut;
12879 uDstOut.au8[0] = uSrc1.au8[0];
12880 uDstOut.au8[1] = uSrc2.au8[0];
12881 uDstOut.au8[2] = uSrc1.au8[1];
12882 uDstOut.au8[3] = uSrc2.au8[1];
12883 uDstOut.au8[4] = uSrc1.au8[2];
12884 uDstOut.au8[5] = uSrc2.au8[2];
12885 uDstOut.au8[6] = uSrc1.au8[3];
12886 uDstOut.au8[7] = uSrc2.au8[3];
12887 *puDst = uDstOut.u;
12888}
12889
12890
12891IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12892{
12893 RTUINT128U const uSrc2 = *puSrc;
12894 RTUINT128U const uSrc1 = *puDst;
12895 ASMCompilerBarrier();
12896 RTUINT128U uDstOut;
12897 uDstOut.au8[ 0] = uSrc1.au8[0];
12898 uDstOut.au8[ 1] = uSrc2.au8[0];
12899 uDstOut.au8[ 2] = uSrc1.au8[1];
12900 uDstOut.au8[ 3] = uSrc2.au8[1];
12901 uDstOut.au8[ 4] = uSrc1.au8[2];
12902 uDstOut.au8[ 5] = uSrc2.au8[2];
12903 uDstOut.au8[ 6] = uSrc1.au8[3];
12904 uDstOut.au8[ 7] = uSrc2.au8[3];
12905 uDstOut.au8[ 8] = uSrc1.au8[4];
12906 uDstOut.au8[ 9] = uSrc2.au8[4];
12907 uDstOut.au8[10] = uSrc1.au8[5];
12908 uDstOut.au8[11] = uSrc2.au8[5];
12909 uDstOut.au8[12] = uSrc1.au8[6];
12910 uDstOut.au8[13] = uSrc2.au8[6];
12911 uDstOut.au8[14] = uSrc1.au8[7];
12912 uDstOut.au8[15] = uSrc2.au8[7];
12913 *puDst = uDstOut;
12914}
12915
12916#endif
12917
12918IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12919{
12920 RTUINT128U const uSrc2 = *puSrc2;
12921 RTUINT128U const uSrc1 = *puSrc1;
12922 ASMCompilerBarrier();
12923 RTUINT128U uDstOut;
12924 uDstOut.au8[ 0] = uSrc1.au8[0];
12925 uDstOut.au8[ 1] = uSrc2.au8[0];
12926 uDstOut.au8[ 2] = uSrc1.au8[1];
12927 uDstOut.au8[ 3] = uSrc2.au8[1];
12928 uDstOut.au8[ 4] = uSrc1.au8[2];
12929 uDstOut.au8[ 5] = uSrc2.au8[2];
12930 uDstOut.au8[ 6] = uSrc1.au8[3];
12931 uDstOut.au8[ 7] = uSrc2.au8[3];
12932 uDstOut.au8[ 8] = uSrc1.au8[4];
12933 uDstOut.au8[ 9] = uSrc2.au8[4];
12934 uDstOut.au8[10] = uSrc1.au8[5];
12935 uDstOut.au8[11] = uSrc2.au8[5];
12936 uDstOut.au8[12] = uSrc1.au8[6];
12937 uDstOut.au8[13] = uSrc2.au8[6];
12938 uDstOut.au8[14] = uSrc1.au8[7];
12939 uDstOut.au8[15] = uSrc2.au8[7];
12940 *puDst = uDstOut;
12941}
12942
12943
12944IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12945{
12946 RTUINT256U const uSrc2 = *puSrc2;
12947 RTUINT256U const uSrc1 = *puSrc1;
12948 ASMCompilerBarrier();
12949 RTUINT256U uDstOut;
12950 uDstOut.au8[ 0] = uSrc1.au8[0];
12951 uDstOut.au8[ 1] = uSrc2.au8[0];
12952 uDstOut.au8[ 2] = uSrc1.au8[1];
12953 uDstOut.au8[ 3] = uSrc2.au8[1];
12954 uDstOut.au8[ 4] = uSrc1.au8[2];
12955 uDstOut.au8[ 5] = uSrc2.au8[2];
12956 uDstOut.au8[ 6] = uSrc1.au8[3];
12957 uDstOut.au8[ 7] = uSrc2.au8[3];
12958 uDstOut.au8[ 8] = uSrc1.au8[4];
12959 uDstOut.au8[ 9] = uSrc2.au8[4];
12960 uDstOut.au8[10] = uSrc1.au8[5];
12961 uDstOut.au8[11] = uSrc2.au8[5];
12962 uDstOut.au8[12] = uSrc1.au8[6];
12963 uDstOut.au8[13] = uSrc2.au8[6];
12964 uDstOut.au8[14] = uSrc1.au8[7];
12965 uDstOut.au8[15] = uSrc2.au8[7];
12966 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12967 uDstOut.au8[16] = uSrc1.au8[16];
12968 uDstOut.au8[17] = uSrc2.au8[16];
12969 uDstOut.au8[18] = uSrc1.au8[17];
12970 uDstOut.au8[19] = uSrc2.au8[17];
12971 uDstOut.au8[20] = uSrc1.au8[18];
12972 uDstOut.au8[21] = uSrc2.au8[18];
12973 uDstOut.au8[22] = uSrc1.au8[19];
12974 uDstOut.au8[23] = uSrc2.au8[19];
12975 uDstOut.au8[24] = uSrc1.au8[20];
12976 uDstOut.au8[25] = uSrc2.au8[20];
12977 uDstOut.au8[26] = uSrc1.au8[21];
12978 uDstOut.au8[27] = uSrc2.au8[21];
12979 uDstOut.au8[28] = uSrc1.au8[22];
12980 uDstOut.au8[29] = uSrc2.au8[22];
12981 uDstOut.au8[30] = uSrc1.au8[23];
12982 uDstOut.au8[31] = uSrc2.au8[23];
12983 *puDst = uDstOut;
12984}
12985
12986
12987/*
12988 * PUNPCKLWD - low words -> dwords
12989 */
12990#ifdef IEM_WITHOUT_ASSEMBLY
12991
12992IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12993{
12994 RTUINT64U const uSrc2 = { *puSrc };
12995 RTUINT64U const uSrc1 = { *puDst };
12996 ASMCompilerBarrier();
12997 RTUINT64U uDstOut;
12998 uDstOut.au16[0] = uSrc1.au16[0];
12999 uDstOut.au16[1] = uSrc2.au16[0];
13000 uDstOut.au16[2] = uSrc1.au16[1];
13001 uDstOut.au16[3] = uSrc2.au16[1];
13002 *puDst = uDstOut.u;
13003}
13004
13005
13006IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13007{
13008 RTUINT128U const uSrc2 = *puSrc;
13009 RTUINT128U const uSrc1 = *puDst;
13010 ASMCompilerBarrier();
13011 RTUINT128U uDstOut;
13012 uDstOut.au16[0] = uSrc1.au16[0];
13013 uDstOut.au16[1] = uSrc2.au16[0];
13014 uDstOut.au16[2] = uSrc1.au16[1];
13015 uDstOut.au16[3] = uSrc2.au16[1];
13016 uDstOut.au16[4] = uSrc1.au16[2];
13017 uDstOut.au16[5] = uSrc2.au16[2];
13018 uDstOut.au16[6] = uSrc1.au16[3];
13019 uDstOut.au16[7] = uSrc2.au16[3];
13020 *puDst = uDstOut;
13021}
13022
13023#endif
13024
13025IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13026{
13027 RTUINT128U const uSrc2 = *puSrc2;
13028 RTUINT128U const uSrc1 = *puSrc1;
13029 ASMCompilerBarrier();
13030 RTUINT128U uDstOut;
13031 uDstOut.au16[0] = uSrc1.au16[0];
13032 uDstOut.au16[1] = uSrc2.au16[0];
13033 uDstOut.au16[2] = uSrc1.au16[1];
13034 uDstOut.au16[3] = uSrc2.au16[1];
13035 uDstOut.au16[4] = uSrc1.au16[2];
13036 uDstOut.au16[5] = uSrc2.au16[2];
13037 uDstOut.au16[6] = uSrc1.au16[3];
13038 uDstOut.au16[7] = uSrc2.au16[3];
13039 *puDst = uDstOut;
13040}
13041
13042
13043IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13044{
13045 RTUINT256U const uSrc2 = *puSrc2;
13046 RTUINT256U const uSrc1 = *puSrc1;
13047 ASMCompilerBarrier();
13048 RTUINT256U uDstOut;
13049 uDstOut.au16[0] = uSrc1.au16[0];
13050 uDstOut.au16[1] = uSrc2.au16[0];
13051 uDstOut.au16[2] = uSrc1.au16[1];
13052 uDstOut.au16[3] = uSrc2.au16[1];
13053 uDstOut.au16[4] = uSrc1.au16[2];
13054 uDstOut.au16[5] = uSrc2.au16[2];
13055 uDstOut.au16[6] = uSrc1.au16[3];
13056 uDstOut.au16[7] = uSrc2.au16[3];
13057
13058 uDstOut.au16[8] = uSrc1.au16[8];
13059 uDstOut.au16[9] = uSrc2.au16[8];
13060 uDstOut.au16[10] = uSrc1.au16[9];
13061 uDstOut.au16[11] = uSrc2.au16[9];
13062 uDstOut.au16[12] = uSrc1.au16[10];
13063 uDstOut.au16[13] = uSrc2.au16[10];
13064 uDstOut.au16[14] = uSrc1.au16[11];
13065 uDstOut.au16[15] = uSrc2.au16[11];
13066 *puDst = uDstOut;
13067}
13068
13069
13070/*
13071 * PUNPCKLDQ - low dwords -> qword(s)
13072 */
13073#ifdef IEM_WITHOUT_ASSEMBLY
13074
13075IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
13076{
13077 RTUINT64U const uSrc2 = { *puSrc };
13078 RTUINT64U const uSrc1 = { *puDst };
13079 ASMCompilerBarrier();
13080 RTUINT64U uDstOut;
13081 uDstOut.au32[0] = uSrc1.au32[0];
13082 uDstOut.au32[1] = uSrc2.au32[0];
13083 *puDst = uDstOut.u;
13084}
13085
13086
13087IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13088{
13089 RTUINT128U const uSrc2 = *puSrc;
13090 RTUINT128U const uSrc1 = *puDst;
13091 ASMCompilerBarrier();
13092 RTUINT128U uDstOut;
13093 uDstOut.au32[0] = uSrc1.au32[0];
13094 uDstOut.au32[1] = uSrc2.au32[0];
13095 uDstOut.au32[2] = uSrc1.au32[1];
13096 uDstOut.au32[3] = uSrc2.au32[1];
13097 *puDst = uDstOut;
13098}
13099
13100#endif
13101
13102IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13103{
13104 RTUINT128U const uSrc2 = *puSrc2;
13105 RTUINT128U const uSrc1 = *puSrc1;
13106 ASMCompilerBarrier();
13107 RTUINT128U uDstOut;
13108 uDstOut.au32[0] = uSrc1.au32[0];
13109 uDstOut.au32[1] = uSrc2.au32[0];
13110 uDstOut.au32[2] = uSrc1.au32[1];
13111 uDstOut.au32[3] = uSrc2.au32[1];
13112 *puDst = uDstOut;
13113}
13114
13115
13116IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13117{
13118 RTUINT256U const uSrc2 = *puSrc2;
13119 RTUINT256U const uSrc1 = *puSrc1;
13120 ASMCompilerBarrier();
13121 RTUINT256U uDstOut;
13122 uDstOut.au32[0] = uSrc1.au32[0];
13123 uDstOut.au32[1] = uSrc2.au32[0];
13124 uDstOut.au32[2] = uSrc1.au32[1];
13125 uDstOut.au32[3] = uSrc2.au32[1];
13126
13127 uDstOut.au32[4] = uSrc1.au32[4];
13128 uDstOut.au32[5] = uSrc2.au32[4];
13129 uDstOut.au32[6] = uSrc1.au32[5];
13130 uDstOut.au32[7] = uSrc2.au32[5];
13131 *puDst = uDstOut;
13132}
13133
13134
13135/*
13136 * PUNPCKLQDQ -> Low qwords -> double qword(s).
13137 */
13138#ifdef IEM_WITHOUT_ASSEMBLY
13139IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13140{
13141 RTUINT128U const uSrc2 = *puSrc;
13142 RTUINT128U const uSrc1 = *puDst;
13143 ASMCompilerBarrier();
13144 RTUINT128U uDstOut;
13145 uDstOut.au64[0] = uSrc1.au64[0];
13146 uDstOut.au64[1] = uSrc2.au64[0];
13147 *puDst = uDstOut;
13148}
13149#endif
13150
13151
13152IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13153{
13154 RTUINT128U const uSrc2 = *puSrc2;
13155 RTUINT128U const uSrc1 = *puSrc1;
13156 ASMCompilerBarrier();
13157 RTUINT128U uDstOut;
13158 uDstOut.au64[0] = uSrc1.au64[0];
13159 uDstOut.au64[1] = uSrc2.au64[0];
13160 *puDst = uDstOut;
13161}
13162
13163
13164IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13165{
13166 RTUINT256U const uSrc2 = *puSrc2;
13167 RTUINT256U const uSrc1 = *puSrc1;
13168 ASMCompilerBarrier();
13169 RTUINT256U uDstOut;
13170 uDstOut.au64[0] = uSrc1.au64[0];
13171 uDstOut.au64[1] = uSrc2.au64[0];
13172
13173 uDstOut.au64[2] = uSrc1.au64[2];
13174 uDstOut.au64[3] = uSrc2.au64[2];
13175 *puDst = uDstOut;
13176}
13177
13178
13179/*
13180 * PACKSSWB - signed words -> signed bytes
13181 */
13182
13183#ifdef IEM_WITHOUT_ASSEMBLY
13184
13185IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13186{
13187 RTUINT64U const uSrc2 = { *puSrc };
13188 RTUINT64U const uSrc1 = { *puDst };
13189 ASMCompilerBarrier();
13190 RTUINT64U uDstOut;
13191 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13192 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13193 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13194 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13195 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13196 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13197 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13198 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13199 *puDst = uDstOut.u;
13200}
13201
13202
13203IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13204{
13205 RTUINT128U const uSrc2 = *puSrc;
13206 RTUINT128U const uSrc1 = *puDst;
13207 ASMCompilerBarrier();
13208 RTUINT128U uDstOut;
13209 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13210 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13211 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13212 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13213 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13214 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13215 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13216 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13217 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13218 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13219 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13220 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13221 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13222 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13223 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13224 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13225 *puDst = uDstOut;
13226}
13227
13228#endif
13229
13230IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13231{
13232 RTUINT128U const uSrc2 = *puSrc2;
13233 RTUINT128U const uSrc1 = *puSrc1;
13234 ASMCompilerBarrier();
13235 RTUINT128U uDstOut;
13236 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13237 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13238 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13239 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13240 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13241 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13242 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13243 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13244 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13245 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13246 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13247 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13248 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13249 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13250 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13251 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13252 *puDst = uDstOut;
13253}
13254
13255
13256IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13257{
13258 RTUINT256U const uSrc2 = *puSrc2;
13259 RTUINT256U const uSrc1 = *puSrc1;
13260 ASMCompilerBarrier();
13261 RTUINT256U uDstOut;
13262 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13263 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13264 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13265 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13266 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13267 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13268 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13269 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13270 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13271 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13272 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13273 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13274 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13275 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13276 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13277 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13278
13279 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
13280 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
13281 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
13282 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
13283 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
13284 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
13285 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
13286 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
13287 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
13288 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
13289 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
13290 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
13291 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
13292 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
13293 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
13294 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
13295 *puDst = uDstOut;
13296}
13297
13298
13299/*
13300 * PACKUSWB - signed words -> unsigned bytes
13301 */
/**
 * Narrows a signed 16-bit word to an unsigned byte with saturation.
 *
 * A value whose low 16 bits lie in 0..0xff is passed through.  Anything else
 * saturates: to 0x00 (UINT8_MIN) when bit 15, the sign bit, is set, and to
 * 0xff (UINT8_MAX) when it is clear.
 *
 * @note    The argument is evaluated more than once, so it must not have side
 *          effects.
 */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    (  (uint16_t)(a_iWord) > (uint16_t)0xff \
     ? (((a_iWord) & 0x8000) ? 0x00 : 0xff) \
     : (uint8_t)(a_iWord) )
13306
13307#ifdef IEM_WITHOUT_ASSEMBLY
13308
13309IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13310{
13311 RTUINT64U const uSrc2 = { *puSrc };
13312 RTUINT64U const uSrc1 = { *puDst };
13313 ASMCompilerBarrier();
13314 RTUINT64U uDstOut;
13315 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13316 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13317 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13318 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13319 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13320 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13321 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13322 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13323 *puDst = uDstOut.u;
13324}
13325
13326
13327IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13328{
13329 RTUINT128U const uSrc2 = *puSrc;
13330 RTUINT128U const uSrc1 = *puDst;
13331 ASMCompilerBarrier();
13332 RTUINT128U uDstOut;
13333 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13334 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13335 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13336 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13337 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13338 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13339 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13340 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13341 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13342 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13343 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13344 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13345 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13346 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13347 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13348 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13349 *puDst = uDstOut;
13350}
13351
13352#endif
13353
13354IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13355{
13356 RTUINT128U const uSrc2 = *puSrc2;
13357 RTUINT128U const uSrc1 = *puSrc1;
13358 ASMCompilerBarrier();
13359 RTUINT128U uDstOut;
13360 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13361 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13362 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13363 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13364 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13365 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13366 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13367 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13368 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13369 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13370 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13371 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13372 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13373 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13374 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13375 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13376 *puDst = uDstOut;
13377}
13378
13379
13380IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13381{
13382 RTUINT256U const uSrc2 = *puSrc2;
13383 RTUINT256U const uSrc1 = *puSrc1;
13384 ASMCompilerBarrier();
13385 RTUINT256U uDstOut;
13386 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13387 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13388 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13389 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13390 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13391 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13392 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13393 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13394 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13395 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13396 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13397 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13398 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13399 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13400 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13401 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13402
13403 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
13404 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
13405 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
13406 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
13407 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
13408 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
13409 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
13410 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
13411 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
13412 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
13413 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
13414 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
13415 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
13416 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
13417 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
13418 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
13419 *puDst = uDstOut;
13420}
13421
13422
13423/*
13424 * PACKSSDW - signed dwords -> signed words
13425 */
13426
13427#ifdef IEM_WITHOUT_ASSEMBLY
13428
13429IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13430{
13431 RTUINT64U const uSrc2 = { *puSrc };
13432 RTUINT64U const uSrc1 = { *puDst };
13433 ASMCompilerBarrier();
13434 RTUINT64U uDstOut;
13435 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13436 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13437 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13438 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13439 *puDst = uDstOut.u;
13440}
13441
13442
13443IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13444{
13445 RTUINT128U const uSrc2 = *puSrc;
13446 RTUINT128U const uSrc1 = *puDst;
13447 ASMCompilerBarrier();
13448 RTUINT128U uDstOut;
13449 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13450 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13451 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13452 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13453 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13454 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13455 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13456 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13457 *puDst = uDstOut;
13458}
13459
13460#endif
13461
13462IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13463{
13464 RTUINT128U const uSrc2 = *puSrc2;
13465 RTUINT128U const uSrc1 = *puSrc1;
13466 ASMCompilerBarrier();
13467 RTUINT128U uDstOut;
13468 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13469 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13470 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13471 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13472 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13473 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13474 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13475 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13476 *puDst = uDstOut;
13477}
13478
13479
13480IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13481{
13482 RTUINT256U const uSrc2 = *puSrc2;
13483 RTUINT256U const uSrc1 = *puSrc1;
13484 ASMCompilerBarrier();
13485 RTUINT256U uDstOut;
13486 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13487 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13488 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13489 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13490 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13491 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13492 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13493 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13494
13495 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
13496 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
13497 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
13498 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
13499 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
13500 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
13501 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
13502 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
13503 *puDst = uDstOut;
13504}
13505
13506
13507/*
13508 * PACKUSDW - signed dwords -> unsigned words
13509 */
/**
 * Narrows a signed 32-bit dword to an unsigned word with saturation.
 *
 * A value whose low 32 bits lie in 0..0xffff is passed through.  Anything
 * else saturates: to 0x0000 when bit 31, the sign bit, is set, and to 0xffff
 * (UINT16_MAX) when it is clear.
 *
 * @note    The argument is evaluated more than once, so it must not have side
 *          effects.
 */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    (  (uint32_t)(a_iDword) > (uint32_t)0xffff \
     ? (((a_iDword) & 0x80000000U) ? 0x0000 : 0xffff) \
     : (uint16_t)(a_iDword) )
13514
#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * PACKUSDW, 128-bit operands.
 *
 * Each of the four dwords of the destination and of the source is narrowed to
 * a word with SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD.  The destination's
 * dwords fill result words 0..3, the source's dwords fill result words 4..7.
 *
 * @param   puDst   First source and destination.
 * @param   puSrc   Second source.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
{
    /* Copy both inputs up front; the destination is overwritten below. */
    RTUINT128U const uUpperIn = *puSrc;
    RTUINT128U const uLowerIn = *puDst;
    ASMCompilerBarrier();

    unsigned const cDwords = RT_ELEMENTS(uLowerIn.au32);
    for (unsigned iDword = 0; iDword < cDwords; iDword++)
    {
        puDst->au16[iDword]           = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uLowerIn.au32[iDword]);
        puDst->au16[cDwords + iDword] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uUpperIn.au32[iDword]);
    }
}
#endif
13533
13534IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13535{
13536 RTUINT128U const uSrc2 = *puSrc2;
13537 RTUINT128U const uSrc1 = *puSrc1;
13538 ASMCompilerBarrier();
13539 RTUINT128U uDstOut;
13540 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13541 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13542 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13543 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13544 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13545 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13546 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13547 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13548 *puDst = uDstOut;
13549}
13550
13551
13552IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13553{
13554 RTUINT256U const uSrc2 = *puSrc2;
13555 RTUINT256U const uSrc1 = *puSrc1;
13556 ASMCompilerBarrier();
13557 RTUINT256U uDstOut;
13558 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13559 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13560 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13561 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13562 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13563 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13564 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13565 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13566
13567 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
13568 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
13569 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
13570 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
13571 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
13572 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
13573 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
13574 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
13575 *puDst = uDstOut;
13576}
13577
13578
13579/*
13580 * [V]PABSB / [V]PABSW / [V]PABSD
13581 */
13582
13583IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13584{
13585 RTUINT64U const uSrc = { *puSrc };
13586 RTUINT64U uDstOut = { 0 };
13587
13588 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
13589 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
13590 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
13591 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
13592 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
13593 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
13594 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
13595 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
13596 *puDst = uDstOut.u;
13597 RT_NOREF(pFpuState);
13598}
13599
13600
13601IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13602{
13603 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13604 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13605 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13606 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13607 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13608 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13609 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13610 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13611 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13612 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13613 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13614 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13615 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13616 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13617 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13618 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13619 RT_NOREF(pFpuState);
13620}
13621
13622
13623IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13624{
13625 RTUINT64U const uSrc = { *puSrc };
13626 RTUINT64U uDstOut = { 0 };
13627
13628 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
13629 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
13630 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
13631 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
13632 *puDst = uDstOut.u;
13633 RT_NOREF(pFpuState);
13634}
13635
13636
13637IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13638{
13639 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13640 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13641 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13642 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13643 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13644 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13645 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13646 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13647 RT_NOREF(pFpuState);
13648}
13649
13650
13651IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13652{
13653 RTUINT64U const uSrc = { *puSrc };
13654 RTUINT64U uDstOut = { 0 };
13655
13656 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
13657 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
13658 *puDst = uDstOut.u;
13659 RT_NOREF(pFpuState);
13660}
13661
13662
13663IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13664{
13665 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13666 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13667 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13668 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13669 RT_NOREF(pFpuState);
13670}
13671
13672
13673IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13674{
13675 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13676 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13677 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13678 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13679 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13680 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13681 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13682 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13683 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13684 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13685 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13686 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13687 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13688 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13689 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13690 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13691}
13692
13693
13694IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13695{
13696 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13697 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13698 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13699 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13700 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13701 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13702 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13703 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13704 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13705 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13706 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13707 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13708 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13709 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13710 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13711 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13712 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
13713 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
13714 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
13715 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
13716 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
13717 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
13718 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
13719 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
13720 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
13721 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
13722 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
13723 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
13724 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
13725 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
13726 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
13727 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
13728}
13729
13730
13731IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13732{
13733 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13734 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13735 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13736 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13737 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13738 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13739 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13740 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13741}
13742
13743
13744IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13745{
13746 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13747 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13748 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13749 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13750 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13751 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13752 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13753 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13754 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
13755 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
13756 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
13757 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
13758 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
13759 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
13760 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
13761 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
13762}
13763
13764
13765IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13766{
13767 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13768 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13769 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13770 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13771}
13772
13773
13774IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13775{
13776 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13777 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13778 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13779 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13780 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
13781 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
13782 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
13783 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
13784}
13785
13786
13787/*
13788 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
13789 */
13790IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13791{
13792 RTUINT64U uSrc1 = { *puDst };
13793 RTUINT64U uSrc2 = { *puSrc };
13794 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13795
13796 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
13797 {
13798 if (uSrc2.ai8[i] < 0)
13799 uDst.ai8[i] = -uSrc1.ai8[i];
13800 else if (uSrc2.ai8[i] == 0)
13801 uDst.ai8[i] = 0;
13802 else /* uSrc2.ai8[i] > 0 */
13803 uDst.ai8[i] = uSrc1.ai8[i];
13804 }
13805
13806 *puDst = uDst.u;
13807 RT_NOREF(pFpuState);
13808}
13809
13810
13811IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13812{
13813 RTUINT128U uSrc1 = *puDst;
13814
13815 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13816 {
13817 if (puSrc->ai8[i] < 0)
13818 puDst->ai8[i] = -uSrc1.ai8[i];
13819 else if (puSrc->ai8[i] == 0)
13820 puDst->ai8[i] = 0;
13821 else /* puSrc->ai8[i] > 0 */
13822 puDst->ai8[i] = uSrc1.ai8[i];
13823 }
13824
13825 RT_NOREF(pFpuState);
13826}
13827
13828
13829IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13830{
13831 RTUINT64U uSrc1 = { *puDst };
13832 RTUINT64U uSrc2 = { *puSrc };
13833 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13834
13835 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
13836 {
13837 if (uSrc2.ai16[i] < 0)
13838 uDst.ai16[i] = -uSrc1.ai16[i];
13839 else if (uSrc2.ai16[i] == 0)
13840 uDst.ai16[i] = 0;
13841 else /* uSrc2.ai16[i] > 0 */
13842 uDst.ai16[i] = uSrc1.ai16[i];
13843 }
13844
13845 *puDst = uDst.u;
13846 RT_NOREF(pFpuState);
13847}
13848
13849
13850IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13851{
13852 RTUINT128U uSrc1 = *puDst;
13853
13854 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13855 {
13856 if (puSrc->ai16[i] < 0)
13857 puDst->ai16[i] = -uSrc1.ai16[i];
13858 else if (puSrc->ai16[i] == 0)
13859 puDst->ai16[i] = 0;
13860 else /* puSrc->ai16[i] > 0 */
13861 puDst->ai16[i] = uSrc1.ai16[i];
13862 }
13863
13864 RT_NOREF(pFpuState);
13865}
13866
13867
13868IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13869{
13870 RTUINT64U uSrc1 = { *puDst };
13871 RTUINT64U uSrc2 = { *puSrc };
13872 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13873
13874 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
13875 {
13876 if (uSrc2.ai32[i] < 0)
13877 uDst.ai32[i] = -uSrc1.ai32[i];
13878 else if (uSrc2.ai32[i] == 0)
13879 uDst.ai32[i] = 0;
13880 else /* uSrc2.ai32[i] > 0 */
13881 uDst.ai32[i] = uSrc1.ai32[i];
13882 }
13883
13884 *puDst = uDst.u;
13885 RT_NOREF(pFpuState);
13886}
13887
13888
13889IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13890{
13891 RTUINT128U uSrc1 = *puDst;
13892
13893 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13894 {
13895 if (puSrc->ai32[i] < 0)
13896 puDst->ai32[i] = -uSrc1.ai32[i];
13897 else if (puSrc->ai32[i] == 0)
13898 puDst->ai32[i] = 0;
13899 else /* puSrc->ai32[i] > 0 */
13900 puDst->ai32[i] = uSrc1.ai32[i];
13901 }
13902
13903 RT_NOREF(pFpuState);
13904}
13905
13906
13907IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13908{
13909 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13910 {
13911 if (puSrc2->ai8[i] < 0)
13912 puDst->ai8[i] = -puSrc1->ai8[i];
13913 else if (puSrc2->ai8[i] == 0)
13914 puDst->ai8[i] = 0;
13915 else /* puSrc2->ai8[i] > 0 */
13916 puDst->ai8[i] = puSrc1->ai8[i];
13917 }
13918}
13919
13920
13921IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13922{
13923 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13924 {
13925 if (puSrc2->ai8[i] < 0)
13926 puDst->ai8[i] = -puSrc1->ai8[i];
13927 else if (puSrc2->ai8[i] == 0)
13928 puDst->ai8[i] = 0;
13929 else /* puSrc2->ai8[i] > 0 */
13930 puDst->ai8[i] = puSrc1->ai8[i];
13931 }
13932}
13933
13934
13935IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13936{
13937 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13938 {
13939 if (puSrc2->ai16[i] < 0)
13940 puDst->ai16[i] = -puSrc1->ai16[i];
13941 else if (puSrc2->ai16[i] == 0)
13942 puDst->ai16[i] = 0;
13943 else /* puSrc2->ai16[i] > 0 */
13944 puDst->ai16[i] = puSrc1->ai16[i];
13945 }
13946}
13947
13948
13949IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13950{
13951 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13952 {
13953 if (puSrc2->ai16[i] < 0)
13954 puDst->ai16[i] = -puSrc1->ai16[i];
13955 else if (puSrc2->ai16[i] == 0)
13956 puDst->ai16[i] = 0;
13957 else /* puSrc2->ai16[i] > 0 */
13958 puDst->ai16[i] = puSrc1->ai16[i];
13959 }
13960}
13961
13962
13963IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13964{
13965 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13966 {
13967 if (puSrc2->ai32[i] < 0)
13968 puDst->ai32[i] = -puSrc1->ai32[i];
13969 else if (puSrc2->ai32[i] == 0)
13970 puDst->ai32[i] = 0;
13971 else /* puSrc2->ai32[i] > 0 */
13972 puDst->ai32[i] = puSrc1->ai32[i];
13973 }
13974}
13975
13976
13977IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13978{
13979 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13980 {
13981 if (puSrc2->ai32[i] < 0)
13982 puDst->ai32[i] = -puSrc1->ai32[i];
13983 else if (puSrc2->ai32[i] == 0)
13984 puDst->ai32[i] = 0;
13985 else /* puSrc2->ai32[i] > 0 */
13986 puDst->ai32[i] = puSrc1->ai32[i];
13987 }
13988}
13989
13990
13991/*
13992 * PHADDW / VPHADDW / PHADDD / VPHADDD
13993 */
13994IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13995{
13996 RTUINT64U uSrc1 = { *puDst };
13997 RTUINT64U uSrc2 = { *puSrc };
13998 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13999
14000 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
14001 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
14002 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
14003 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
14004 *puDst = uDst.u;
14005 RT_NOREF(pFpuState);
14006}
14007
14008
14009IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14010{
14011 RTUINT128U uSrc1 = *puDst;
14012
14013 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
14014 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
14015 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
14016 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
14017
14018 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
14019 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
14020 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
14021 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
14022 RT_NOREF(pFpuState);
14023}
14024
14025
14026IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14027{
14028 RTUINT64U uSrc1 = { *puDst };
14029 RTUINT64U uSrc2 = { *puSrc };
14030 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14031
14032 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14033 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
14034 *puDst = uDst.u;
14035 RT_NOREF(pFpuState);
14036}
14037
14038
14039IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14040{
14041 RTUINT128U uSrc1 = *puDst;
14042
14043 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14044 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
14045
14046 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
14047 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
14048 RT_NOREF(pFpuState);
14049}
14050
14051
14052IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14053{
14054 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14055
14056 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
14057 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
14058 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
14059 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
14060
14061 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
14062 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
14063 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
14064 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
14065
14066 puDst->au64[0] = uDst.au64[0];
14067 puDst->au64[1] = uDst.au64[1];
14068}
14069
14070
14071IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14072{
14073 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14074
14075 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
14076 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
14077 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
14078 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
14079 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
14080 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
14081 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
14082 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
14083
14084 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
14085 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
14086 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
14087 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
14088 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
14089 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
14090 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
14091 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
14092
14093 puDst->au64[0] = uDst.au64[0];
14094 puDst->au64[1] = uDst.au64[1];
14095 puDst->au64[2] = uDst.au64[2];
14096 puDst->au64[3] = uDst.au64[3];
14097}
14098
14099
14100IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14101{
14102 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14103
14104 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
14105 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
14106
14107 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
14108 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
14109
14110 puDst->au64[0] = uDst.au64[0];
14111 puDst->au64[1] = uDst.au64[1];
14112}
14113
14114
14115IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14116{
14117 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14118
14119 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
14120 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
14121 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
14122 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
14123
14124 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
14125 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
14126 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
14127 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
14128
14129 puDst->au64[0] = uDst.au64[0];
14130 puDst->au64[1] = uDst.au64[1];
14131 puDst->au64[2] = uDst.au64[2];
14132 puDst->au64[3] = uDst.au64[3];
14133}
14134
14135
14136/*
14137 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
14138 */
14139IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14140{
14141 RTUINT64U uSrc1 = { *puDst };
14142 RTUINT64U uSrc2 = { *puSrc };
14143 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14144
14145 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14146 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14147 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
14148 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
14149 *puDst = uDst.u;
14150 RT_NOREF(pFpuState);
14151}
14152
14153
14154IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14155{
14156 RTUINT128U uSrc1 = *puDst;
14157
14158 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14159 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14160 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
14161 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
14162
14163 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
14164 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
14165 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
14166 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
14167 RT_NOREF(pFpuState);
14168}
14169
14170
14171IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14172{
14173 RTUINT64U uSrc1 = { *puDst };
14174 RTUINT64U uSrc2 = { *puSrc };
14175 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14176
14177 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14178 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
14179 *puDst = uDst.u;
14180 RT_NOREF(pFpuState);
14181}
14182
14183
14184IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14185{
14186 RTUINT128U uSrc1 = *puDst;
14187
14188 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14189 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
14190
14191 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
14192 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
14193 RT_NOREF(pFpuState);
14194}
14195
14196
14197IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14198{
14199 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14200
14201 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
14202 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
14203 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
14204 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
14205
14206 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
14207 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
14208 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
14209 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
14210
14211 puDst->au64[0] = uDst.au64[0];
14212 puDst->au64[1] = uDst.au64[1];
14213}
14214
14215
14216IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14217{
14218 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14219
14220 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
14221 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
14222 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
14223 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
14224 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
14225 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
14226 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
14227 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
14228
14229 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
14230 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
14231 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
14232 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
14233 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
14234 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
14235 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
14236 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
14237
14238 puDst->au64[0] = uDst.au64[0];
14239 puDst->au64[1] = uDst.au64[1];
14240 puDst->au64[2] = uDst.au64[2];
14241 puDst->au64[3] = uDst.au64[3];
14242}
14243
14244
14245IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14246{
14247 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14248
14249 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
14250 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
14251
14252 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
14253 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
14254
14255 puDst->au64[0] = uDst.au64[0];
14256 puDst->au64[1] = uDst.au64[1];
14257}
14258
14259
14260IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14261{
14262 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14263
14264 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
14265 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
14266 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
14267 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
14268
14269 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
14270 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
14271 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
14272 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
14273
14274 puDst->au64[0] = uDst.au64[0];
14275 puDst->au64[1] = uDst.au64[1];
14276 puDst->au64[2] = uDst.au64[2];
14277 puDst->au64[3] = uDst.au64[3];
14278}
14279
14280
14281/*
14282 * PHADDSW / VPHADDSW
14283 */
14284IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14285{
14286 RTUINT64U uSrc1 = { *puDst };
14287 RTUINT64U uSrc2 = { *puSrc };
14288 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14289
14290 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14291 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14292 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
14293 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
14294 *puDst = uDst.u;
14295 RT_NOREF(pFpuState);
14296}
14297
14298
14299IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14300{
14301 RTUINT128U uSrc1 = *puDst;
14302
14303 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14304 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14305 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
14306 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
14307
14308 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
14309 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
14310 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
14311 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
14312 RT_NOREF(pFpuState);
14313}
14314
14315
14316IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14317{
14318 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14319
14320 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
14321 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
14322 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
14323 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
14324
14325 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
14326 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
14327 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
14328 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
14329
14330 puDst->au64[0] = uDst.au64[0];
14331 puDst->au64[1] = uDst.au64[1];
14332}
14333
14334
14335IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14336{
14337 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14338
14339 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
14340 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
14341 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
14342 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
14343 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
14344 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
14345 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
14346 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
14347
14348 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
14349 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
14350 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
14351 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
14352 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
14353 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
14354 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
14355 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
14356
14357 puDst->au64[0] = uDst.au64[0];
14358 puDst->au64[1] = uDst.au64[1];
14359 puDst->au64[2] = uDst.au64[2];
14360 puDst->au64[3] = uDst.au64[3];
14361}
14362
14363
14364/*
14365 * PHSUBSW / VPHSUBSW
14366 */
14367IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14368{
14369 RTUINT64U uSrc1 = { *puDst };
14370 RTUINT64U uSrc2 = { *puSrc };
14371 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14372
14373 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14374 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14375 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
14376 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
14377 *puDst = uDst.u;
14378 RT_NOREF(pFpuState);
14379}
14380
14381
14382IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14383{
14384 RTUINT128U uSrc1 = *puDst;
14385
14386 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14387 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14388 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
14389 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
14390
14391 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
14392 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
14393 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
14394 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
14395 RT_NOREF(pFpuState);
14396}
14397
14398
14399IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14400{
14401 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14402
14403 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
14404 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
14405 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
14406 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
14407
14408 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
14409 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
14410 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
14411 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
14412
14413 puDst->au64[0] = uDst.au64[0];
14414 puDst->au64[1] = uDst.au64[1];
14415}
14416
14417
14418IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14419{
14420 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14421
14422 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
14423 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
14424 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
14425 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
14426 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
14427 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
14428 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
14429 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
14430
14431 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
14432 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
14433 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
14434 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
14435 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
14436 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
14437 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
14438 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
14439
14440 puDst->au64[0] = uDst.au64[0];
14441 puDst->au64[1] = uDst.au64[1];
14442 puDst->au64[2] = uDst.au64[2];
14443 puDst->au64[3] = uDst.au64[3];
14444}
14445
14446
14447/*
14448 * PMADDUBSW / VPMADDUBSW
14449 */
14450IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14451{
14452 RTUINT64U uSrc1 = { *puDst };
14453 RTUINT64U uSrc2 = { *puSrc };
14454 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14455
14456 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
14457 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
14458 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
14459 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
14460 *puDst = uDst.u;
14461 RT_NOREF(pFpuState);
14462}
14463
14464
14465IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14466{
14467 RTUINT128U uSrc1 = *puDst;
14468
14469 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
14470 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
14471 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
14472 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
14473 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
14474 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
14475 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
14476 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
14477 RT_NOREF(pFpuState);
14478}
14479
14480
14481IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14482{
14483 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14484
14485 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14486 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14487 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14488 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14489 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14490 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14491 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14492 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14493
14494 puDst->au64[0] = uDst.au64[0];
14495 puDst->au64[1] = uDst.au64[1];
14496}
14497
14498
14499IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14500{
14501 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14502
14503 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14504 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14505 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14506 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14507 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14508 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14509 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14510 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14511 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
14512 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
14513 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
14514 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
14515 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
14516 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
14517 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
14518 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
14519
14520 puDst->au64[0] = uDst.au64[0];
14521 puDst->au64[1] = uDst.au64[1];
14522 puDst->au64[2] = uDst.au64[2];
14523 puDst->au64[3] = uDst.au64[3];
14524}
14525
14526
14527/*
14528 * PMULHRSW / VPMULHRSW
14529 */
/** Multiplies two signed 16-bit values as 32-bit, shifts the product right by
 *  14, adds one for rounding, shifts right once more and truncates to 16 bits. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ((uint16_t)((((int32_t)(a_Src1) * (a_Src2) >> 14) + 1) >> 1))
14532
14533IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14534{
14535 RTUINT64U uSrc1 = { *puDst };
14536 RTUINT64U uSrc2 = { *puSrc };
14537 RTUINT64U uDst;
14538
14539 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
14540 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
14541 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
14542 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
14543 *puDst = uDst.u;
14544 RT_NOREF(pFpuState);
14545}
14546
14547
14548IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14549{
14550 RTUINT128U uSrc1 = *puDst;
14551
14552 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
14553 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
14554 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
14555 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
14556 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
14557 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
14558 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
14559 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
14560 RT_NOREF(pFpuState);
14561}
14562
14563
14564IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14565{
14566 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14567
14568 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
14569 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
14570 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
14571 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
14572 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
14573 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
14574 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
14575 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
14576
14577 puDst->au64[0] = uDst.au64[0];
14578 puDst->au64[1] = uDst.au64[1];
14579}
14580
14581
14582IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14583{
14584 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14585
14586 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
14587 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
14588 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
14589 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
14590 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
14591 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
14592 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
14593 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
14594 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
14595 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
14596 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
14597 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
14598 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
14599 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
14600 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
14601 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
14602
14603 puDst->au64[0] = uDst.au64[0];
14604 puDst->au64[1] = uDst.au64[1];
14605 puDst->au64[2] = uDst.au64[2];
14606 puDst->au64[3] = uDst.au64[3];
14607}
14608
14609
14610/*
14611 * PSADBW / VPSADBW
14612 */
14613#ifdef IEM_WITHOUT_ASSEMBLY
14614
14615IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
14616{
14617 RTUINT64U uSrc1 = { *puDst };
14618 RTUINT64U uSrc2 = { *puSrc };
14619 RTUINT64U uDst;
14620 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14621 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14622 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14623 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14624 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14625 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14626 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14627 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14628
14629 uDst.au64[0] = 0;
14630 uDst.au16[0] = uSum;
14631 *puDst = uDst.u;
14632}
14633
14634
14635IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14636{
14637 RTUINT128U uSrc1 = *puDst;
14638
14639 puDst->au64[0] = 0;
14640 puDst->au64[1] = 0;
14641
14642 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
14643 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
14644 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
14645 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
14646 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
14647 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
14648 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
14649 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
14650 puDst->au16[0] = uSum;
14651
14652 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
14653 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
14654 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
14655 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
14656 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
14657 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
14658 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
14659 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
14660 puDst->au16[4] = uSum;
14661}
14662
14663#endif
14664
14665IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14666{
14667 RTUINT128U uSrc1 = *puSrc1;
14668 RTUINT128U uSrc2 = *puSrc2;
14669
14670 puDst->au64[0] = 0;
14671 puDst->au64[1] = 0;
14672
14673 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
14674 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14675 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14676 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14677 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14678 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14679 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14680 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14681 puDst->au16[0] = uSum;
14682
14683 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14684 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14685 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14686 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14687 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14688 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14689 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14690 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14691 puDst->au16[4] = uSum;
14692}
14693
14694IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14695{
14696 RTUINT256U uSrc1 = *puSrc1;
14697 RTUINT256U uSrc2 = *puSrc2;
14698
14699 puDst->au64[0] = 0;
14700 puDst->au64[1] = 0;
14701 puDst->au64[2] = 0;
14702 puDst->au64[3] = 0;
14703
14704 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14705 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14706 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14707 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14708 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14709 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14710 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14711 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14712 puDst->au16[0] = uSum;
14713
14714 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14715 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14716 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14717 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14718 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14719 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14720 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14721 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14722 puDst->au16[4] = uSum;
14723
14724 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
14725 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
14726 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
14727 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
14728 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
14729 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
14730 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
14731 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
14732 puDst->au16[8] = uSum;
14733
14734 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
14735 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
14736 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
14737 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
14738 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
14739 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
14740 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
14741 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
14742 puDst->au16[12] = uSum;
14743}
14744
14745
14746/*
14747 * PMULDQ / VPMULDQ
14748 */
14749IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14750{
14751 RTUINT128U uSrc1 = *puDst;
14752
14753 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
14754 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
14755}
14756
14757IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14758{
14759 RTUINT128U uSrc1 = *puSrc1;
14760 RTUINT128U uSrc2 = *puSrc2;
14761
14762 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14763 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14764}
14765
14766IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14767{
14768 RTUINT256U uSrc1 = *puSrc1;
14769 RTUINT256U uSrc2 = *puSrc2;
14770
14771 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14772 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14773 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
14774 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
14775}
14776
14777
14778/*
14779 * PMULUDQ / VPMULUDQ
14780 */
14781#ifdef IEM_WITHOUT_ASSEMBLY
14782
14783IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14784{
14785 RTUINT64U uSrc1 = { *puDst };
14786 RTUINT64U uSrc2 = { *puSrc };
14787 ASMCompilerBarrier();
14788 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14789 RT_NOREF(pFpuState);
14790}
14791
14792
14793IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14794{
14795 RTUINT128U uSrc1 = *puDst;
14796 RTUINT128U uSrc2 = *puSrc;
14797 ASMCompilerBarrier();
14798 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14799 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14800 RT_NOREF(pFpuState);
14801}
14802
14803#endif
14804
14805IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14806{
14807 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14808 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14809 ASMCompilerBarrier();
14810 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14811 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14812}
14813
14814
14815IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14816{
14817 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14818 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14819 ASMCompilerBarrier();
14820 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14821 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14822 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
14823 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
14824}
14825
14826
14827/*
14828 * UNPCKLPS / VUNPCKLPS
14829 */
14830#ifdef IEM_WITHOUT_ASSEMBLY
14831IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14832{
14833 RTUINT128U uSrc1 = *puDst;
14834 RTUINT128U uSrc2 = *puSrc;
14835 ASMCompilerBarrier();
14836 puDst->au32[0] = uSrc1.au32[0];
14837 puDst->au32[1] = uSrc2.au32[0];
14838 puDst->au32[2] = uSrc1.au32[1];
14839 puDst->au32[3] = uSrc2.au32[1];
14840}
14841
14842#endif
14843
14844IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14845{
14846 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14847 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14848 ASMCompilerBarrier();
14849 puDst->au32[0] = uSrc1.au32[0];
14850 puDst->au32[1] = uSrc2.au32[0];
14851 puDst->au32[2] = uSrc1.au32[1];
14852 puDst->au32[3] = uSrc2.au32[1];
14853}
14854
14855
14856IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14857{
14858 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14859 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14860 ASMCompilerBarrier();
14861 puDst->au32[0] = uSrc1.au32[0];
14862 puDst->au32[1] = uSrc2.au32[0];
14863 puDst->au32[2] = uSrc1.au32[1];
14864 puDst->au32[3] = uSrc2.au32[1];
14865
14866 puDst->au32[4] = uSrc1.au32[4];
14867 puDst->au32[5] = uSrc2.au32[4];
14868 puDst->au32[6] = uSrc1.au32[5];
14869 puDst->au32[7] = uSrc2.au32[5];
14870}
14871
14872
14873/*
14874 * UNPCKLPD / VUNPCKLPD
14875 */
14876#ifdef IEM_WITHOUT_ASSEMBLY
14877IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14878{
14879 RTUINT128U uSrc1 = *puDst;
14880 RTUINT128U uSrc2 = *puSrc;
14881 ASMCompilerBarrier();
14882 puDst->au64[0] = uSrc1.au64[0];
14883 puDst->au64[1] = uSrc2.au64[0];
14884}
14885
14886#endif
14887
14888IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14889{
14890 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14891 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14892 ASMCompilerBarrier();
14893 puDst->au64[0] = uSrc1.au64[0];
14894 puDst->au64[1] = uSrc2.au64[0];
14895}
14896
14897
14898IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14899{
14900 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14901 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14902 ASMCompilerBarrier();
14903 puDst->au64[0] = uSrc1.au64[0];
14904 puDst->au64[1] = uSrc2.au64[0];
14905 puDst->au64[2] = uSrc1.au64[2];
14906 puDst->au64[3] = uSrc2.au64[2];
14907}
14908
14909
14910/*
14911 * UNPCKHPS / VUNPCKHPS
14912 */
14913#ifdef IEM_WITHOUT_ASSEMBLY
14914IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14915{
14916 RTUINT128U uSrc1 = *puDst;
14917 RTUINT128U uSrc2 = *puSrc;
14918 ASMCompilerBarrier();
14919 puDst->au32[0] = uSrc1.au32[2];
14920 puDst->au32[1] = uSrc2.au32[2];
14921 puDst->au32[2] = uSrc1.au32[3];
14922 puDst->au32[3] = uSrc2.au32[3];
14923}
14924
14925#endif
14926
14927IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14928{
14929 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14930 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14931 ASMCompilerBarrier();
14932 puDst->au32[0] = uSrc1.au32[2];
14933 puDst->au32[1] = uSrc2.au32[2];
14934 puDst->au32[2] = uSrc1.au32[3];
14935 puDst->au32[3] = uSrc2.au32[3];
14936}
14937
14938
14939IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14940{
14941 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14942 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14943 ASMCompilerBarrier();
14944 puDst->au32[0] = uSrc1.au32[2];
14945 puDst->au32[1] = uSrc2.au32[2];
14946 puDst->au32[2] = uSrc1.au32[3];
14947 puDst->au32[3] = uSrc2.au32[3];
14948
14949 puDst->au32[4] = uSrc1.au32[6];
14950 puDst->au32[5] = uSrc2.au32[6];
14951 puDst->au32[6] = uSrc1.au32[7];
14952 puDst->au32[7] = uSrc2.au32[7];
14953}
14954
14955
14956/*
14957 * UNPCKHPD / VUNPCKHPD
14958 */
14959#ifdef IEM_WITHOUT_ASSEMBLY
14960IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14961{
14962 RTUINT128U uSrc1 = *puDst;
14963 RTUINT128U uSrc2 = *puSrc;
14964 ASMCompilerBarrier();
14965 puDst->au64[0] = uSrc1.au64[1];
14966 puDst->au64[1] = uSrc2.au64[1];
14967}
14968
14969#endif
14970
14971IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14972{
14973 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14974 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14975 ASMCompilerBarrier();
14976 puDst->au64[0] = uSrc1.au64[1];
14977 puDst->au64[1] = uSrc2.au64[1];
14978}
14979
14980
14981IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14982{
14983 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14984 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14985 ASMCompilerBarrier();
14986 puDst->au64[0] = uSrc1.au64[1];
14987 puDst->au64[1] = uSrc2.au64[1];
14988 puDst->au64[2] = uSrc1.au64[3];
14989 puDst->au64[3] = uSrc2.au64[3];
14990}
14991
14992
14993/*
 * CRC32 (SSE 4.2).
14995 */
14996
14997IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
14998{
14999 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15000}
15001
15002
15003IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
15004{
15005 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15006}
15007
15008IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
15009{
15010 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15011}
15012
15013IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
15014{
15015 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15016}
15017
15018
15019/*
 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
15021 */
15022#ifdef IEM_WITHOUT_ASSEMBLY
15023IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
15024{
15025 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15026 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
15027 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
15028 fEfl |= X86_EFL_ZF;
15029 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15030 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
15031 fEfl |= X86_EFL_CF;
15032 *pfEFlags = fEfl;
15033}
15034#endif
15035
15036IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
15037{
15038 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15039 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
15040 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
15041 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
15042 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15043 fEfl |= X86_EFL_ZF;
15044 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15045 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
15046 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
15047 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15048 fEfl |= X86_EFL_CF;
15049 *pfEFlags = fEfl;
15050}
15051
15052
15053/*
15054 * PMOVSXBW / VPMOVSXBW
15055 */
15056IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15057{
15058 RTUINT64U uSrc1 = { uSrc };
15059 puDst->ai16[0] = uSrc1.ai8[0];
15060 puDst->ai16[1] = uSrc1.ai8[1];
15061 puDst->ai16[2] = uSrc1.ai8[2];
15062 puDst->ai16[3] = uSrc1.ai8[3];
15063 puDst->ai16[4] = uSrc1.ai8[4];
15064 puDst->ai16[5] = uSrc1.ai8[5];
15065 puDst->ai16[6] = uSrc1.ai8[6];
15066 puDst->ai16[7] = uSrc1.ai8[7];
15067}
15068
15069
15070IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15071{
15072 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15073 puDst->ai16[ 0] = uSrc1.ai8[ 0];
15074 puDst->ai16[ 1] = uSrc1.ai8[ 1];
15075 puDst->ai16[ 2] = uSrc1.ai8[ 2];
15076 puDst->ai16[ 3] = uSrc1.ai8[ 3];
15077 puDst->ai16[ 4] = uSrc1.ai8[ 4];
15078 puDst->ai16[ 5] = uSrc1.ai8[ 5];
15079 puDst->ai16[ 6] = uSrc1.ai8[ 6];
15080 puDst->ai16[ 7] = uSrc1.ai8[ 7];
15081 puDst->ai16[ 8] = uSrc1.ai8[ 8];
15082 puDst->ai16[ 9] = uSrc1.ai8[ 9];
15083 puDst->ai16[10] = uSrc1.ai8[10];
15084 puDst->ai16[11] = uSrc1.ai8[11];
15085 puDst->ai16[12] = uSrc1.ai8[12];
15086 puDst->ai16[13] = uSrc1.ai8[13];
15087 puDst->ai16[14] = uSrc1.ai8[14];
15088 puDst->ai16[15] = uSrc1.ai8[15];
15089}
15090
15091
15092/*
15093 * PMOVSXBD / VPMOVSXBD
15094 */
15095IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15096{
15097 RTUINT32U uSrc1 = { uSrc };
15098 puDst->ai32[0] = uSrc1.ai8[0];
15099 puDst->ai32[1] = uSrc1.ai8[1];
15100 puDst->ai32[2] = uSrc1.ai8[2];
15101 puDst->ai32[3] = uSrc1.ai8[3];
15102}
15103
15104
15105IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15106{
15107 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15108 puDst->ai32[0] = uSrc1.ai8[0];
15109 puDst->ai32[1] = uSrc1.ai8[1];
15110 puDst->ai32[2] = uSrc1.ai8[2];
15111 puDst->ai32[3] = uSrc1.ai8[3];
15112 puDst->ai32[4] = uSrc1.ai8[4];
15113 puDst->ai32[5] = uSrc1.ai8[5];
15114 puDst->ai32[6] = uSrc1.ai8[6];
15115 puDst->ai32[7] = uSrc1.ai8[7];
15116}
15117
15118
15119/*
15120 * PMOVSXBQ / VPMOVSXBQ
15121 */
15122IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15123{
15124 RTUINT16U uSrc1 = { uSrc };
15125 puDst->ai64[0] = uSrc1.ai8[0];
15126 puDst->ai64[1] = uSrc1.ai8[1];
15127}
15128
15129
15130IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15131{
15132 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15133 puDst->ai64[0] = uSrc1.ai8[0];
15134 puDst->ai64[1] = uSrc1.ai8[1];
15135 puDst->ai64[2] = uSrc1.ai8[2];
15136 puDst->ai64[3] = uSrc1.ai8[3];
15137}
15138
15139
15140/*
15141 * PMOVSXWD / VPMOVSXWD
15142 */
15143IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15144{
15145 RTUINT64U uSrc1 = { uSrc };
15146 puDst->ai32[0] = uSrc1.ai16[0];
15147 puDst->ai32[1] = uSrc1.ai16[1];
15148 puDst->ai32[2] = uSrc1.ai16[2];
15149 puDst->ai32[3] = uSrc1.ai16[3];
15150}
15151
15152
15153IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15154{
15155 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15156 puDst->ai32[0] = uSrc1.ai16[0];
15157 puDst->ai32[1] = uSrc1.ai16[1];
15158 puDst->ai32[2] = uSrc1.ai16[2];
15159 puDst->ai32[3] = uSrc1.ai16[3];
15160 puDst->ai32[4] = uSrc1.ai16[4];
15161 puDst->ai32[5] = uSrc1.ai16[5];
15162 puDst->ai32[6] = uSrc1.ai16[6];
15163 puDst->ai32[7] = uSrc1.ai16[7];
15164}
15165
15166
15167/*
15168 * PMOVSXWQ / VPMOVSXWQ
15169 */
15170IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15171{
15172 RTUINT32U uSrc1 = { uSrc };
15173 puDst->ai64[0] = uSrc1.ai16[0];
15174 puDst->ai64[1] = uSrc1.ai16[1];
15175}
15176
15177
15178IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15179{
15180 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15181 puDst->ai64[0] = uSrc1.ai16[0];
15182 puDst->ai64[1] = uSrc1.ai16[1];
15183 puDst->ai64[2] = uSrc1.ai16[2];
15184 puDst->ai64[3] = uSrc1.ai16[3];
15185}
15186
15187
15188/*
15189 * PMOVSXDQ / VPMOVSXDQ
15190 */
15191IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15192{
15193 RTUINT64U uSrc1 = { uSrc };
15194 puDst->ai64[0] = uSrc1.ai32[0];
15195 puDst->ai64[1] = uSrc1.ai32[1];
15196}
15197
15198
15199IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15200{
15201 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15202 puDst->ai64[0] = uSrc1.ai32[0];
15203 puDst->ai64[1] = uSrc1.ai32[1];
15204 puDst->ai64[2] = uSrc1.ai32[2];
15205 puDst->ai64[3] = uSrc1.ai32[3];
15206}
15207
15208
15209/*
15210 * PMOVZXBW / VPMOVZXBW
15211 */
15212IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15213{
15214 RTUINT64U uSrc1 = { uSrc };
15215 puDst->au16[0] = uSrc1.au8[0];
15216 puDst->au16[1] = uSrc1.au8[1];
15217 puDst->au16[2] = uSrc1.au8[2];
15218 puDst->au16[3] = uSrc1.au8[3];
15219 puDst->au16[4] = uSrc1.au8[4];
15220 puDst->au16[5] = uSrc1.au8[5];
15221 puDst->au16[6] = uSrc1.au8[6];
15222 puDst->au16[7] = uSrc1.au8[7];
15223}
15224
15225
15226IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15227{
15228 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15229 puDst->au16[ 0] = uSrc1.au8[ 0];
15230 puDst->au16[ 1] = uSrc1.au8[ 1];
15231 puDst->au16[ 2] = uSrc1.au8[ 2];
15232 puDst->au16[ 3] = uSrc1.au8[ 3];
15233 puDst->au16[ 4] = uSrc1.au8[ 4];
15234 puDst->au16[ 5] = uSrc1.au8[ 5];
15235 puDst->au16[ 6] = uSrc1.au8[ 6];
15236 puDst->au16[ 7] = uSrc1.au8[ 7];
15237 puDst->au16[ 8] = uSrc1.au8[ 8];
15238 puDst->au16[ 9] = uSrc1.au8[ 9];
15239 puDst->au16[10] = uSrc1.au8[10];
15240 puDst->au16[11] = uSrc1.au8[11];
15241 puDst->au16[12] = uSrc1.au8[12];
15242 puDst->au16[13] = uSrc1.au8[13];
15243 puDst->au16[14] = uSrc1.au8[14];
15244 puDst->au16[15] = uSrc1.au8[15];
15245}
15246
15247
15248/*
15249 * PMOVZXBD / VPMOVZXBD
15250 */
15251IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15252{
15253 RTUINT32U uSrc1 = { uSrc };
15254 puDst->au32[0] = uSrc1.au8[0];
15255 puDst->au32[1] = uSrc1.au8[1];
15256 puDst->au32[2] = uSrc1.au8[2];
15257 puDst->au32[3] = uSrc1.au8[3];
15258}
15259
15260
15261IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15262{
15263 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15264 puDst->au32[0] = uSrc1.au8[0];
15265 puDst->au32[1] = uSrc1.au8[1];
15266 puDst->au32[2] = uSrc1.au8[2];
15267 puDst->au32[3] = uSrc1.au8[3];
15268 puDst->au32[4] = uSrc1.au8[4];
15269 puDst->au32[5] = uSrc1.au8[5];
15270 puDst->au32[6] = uSrc1.au8[6];
15271 puDst->au32[7] = uSrc1.au8[7];
15272}
15273
15274
15275/*
15276 * PMOVZXBQ / VPMOVZXBQ
15277 */
15278IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15279{
15280 RTUINT16U uSrc1 = { uSrc };
15281 puDst->au64[0] = uSrc1.au8[0];
15282 puDst->au64[1] = uSrc1.au8[1];
15283}
15284
15285
15286IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15287{
15288 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15289 puDst->au64[0] = uSrc1.au8[0];
15290 puDst->au64[1] = uSrc1.au8[1];
15291 puDst->au64[2] = uSrc1.au8[2];
15292 puDst->au64[3] = uSrc1.au8[3];
15293}
15294
15295
15296/*
15297 * PMOVZXWD / VPMOVZXWD
15298 */
15299IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15300{
15301 RTUINT64U uSrc1 = { uSrc };
15302 puDst->au32[0] = uSrc1.au16[0];
15303 puDst->au32[1] = uSrc1.au16[1];
15304 puDst->au32[2] = uSrc1.au16[2];
15305 puDst->au32[3] = uSrc1.au16[3];
15306}
15307
15308
15309IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15310{
15311 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15312 puDst->au32[0] = uSrc1.au16[0];
15313 puDst->au32[1] = uSrc1.au16[1];
15314 puDst->au32[2] = uSrc1.au16[2];
15315 puDst->au32[3] = uSrc1.au16[3];
15316 puDst->au32[4] = uSrc1.au16[4];
15317 puDst->au32[5] = uSrc1.au16[5];
15318 puDst->au32[6] = uSrc1.au16[6];
15319 puDst->au32[7] = uSrc1.au16[7];
15320}
15321
15322
15323/*
15324 * PMOVZXWQ / VPMOVZXWQ
15325 */
15326IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15327{
15328 RTUINT32U uSrc1 = { uSrc };
15329 puDst->au64[0] = uSrc1.au16[0];
15330 puDst->au64[1] = uSrc1.au16[1];
15331}
15332
15333
15334IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15335{
15336 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15337 puDst->au64[0] = uSrc1.au16[0];
15338 puDst->au64[1] = uSrc1.au16[1];
15339 puDst->au64[2] = uSrc1.au16[2];
15340 puDst->au64[3] = uSrc1.au16[3];
15341}
15342
15343
15344/*
15345 * PMOVZXDQ / VPMOVZXDQ
15346 */
15347IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15348{
15349 RTUINT64U uSrc1 = { uSrc };
15350 puDst->au64[0] = uSrc1.au32[0];
15351 puDst->au64[1] = uSrc1.au32[1];
15352}
15353
15354
15355IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15356{
15357 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15358 puDst->au64[0] = uSrc1.au32[0];
15359 puDst->au64[1] = uSrc1.au32[1];
15360 puDst->au64[2] = uSrc1.au32[2];
15361 puDst->au64[3] = uSrc1.au32[3];
15362}
15363
15364/**
15365 * Converts from the packed IPRT 32-bit (single precision) floating point format to
15366 * the SoftFloat 32-bit floating point format (float32_t).
15367 *
15368 * This is only a structure format conversion, nothing else.
15369 */
15370DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
15371{
15372 float32_t Tmp;
15373 Tmp.v = pr32Val->u;
15374 return Tmp;
15375}
15376
15377
15378/**
15379 * Converts from SoftFloat 32-bit floating point format (float32_t)
15380 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
15381 *
15382 * This is only a structure format conversion, nothing else.
15383 */
15384DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
15385{
15386 pr32Dst->u = r32XSrc.v;
15387 return pr32Dst;
15388}
15389
15390
15391/**
15392 * Converts from the packed IPRT 64-bit (single precision) floating point format to
15393 * the SoftFloat 64-bit floating point format (float64_t).
15394 *
15395 * This is only a structure format conversion, nothing else.
15396 */
15397DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
15398{
15399 float64_t Tmp;
15400 Tmp.v = pr64Val->u;
15401 return Tmp;
15402}
15403
15404
15405/**
15406 * Converts from SoftFloat 64-bit floating point format (float64_t)
15407 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
15408 *
15409 * This is only a structure format conversion, nothing else.
15410 */
15411DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
15412{
15413 pr64Dst->u = r64XSrc.v;
15414 return pr64Dst;
15415}
15416
15417
/**
 * Initializer for the SoftFloat state structure, derived from an MXCSR value.
 *
 * The rounding mode is selected from the MXCSR.RC field and the exception mask
 * from the MXCSR exception mask bits.
 */
#define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        /* Tininess detection mode. */ \
        softfloat_tininess_afterRounding, \
        /* Rounding mode; anything but nearest, up and down means round towards zero. */ \
          ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? (uint8_t)softfloat_round_min \
        :                                                           (uint8_t)softfloat_round_minMag, \
        /* NOTE(review): presumably the exception flags, starting out clear - confirm against softfloat_state_t. */ \
        0, \
        /* Exception mask, matches X86_FSW_?E. */ \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), \
        /* Rounding precision, not relevant for SIMD. */ \
        32 \
    }
15430
15431#ifdef IEM_WITHOUT_ASSEMBLY
15432
15433/**
15434 * Helper for transfering exception to MXCSR and setting the result value
15435 * accordingly.
15436 *
15437 * @returns Updated MXCSR.
15438 * @param pSoftState The SoftFloat state following the operation.
15439 * @param r32Result The result of the SoftFloat operation.
15440 * @param pr32Result Where to store the result for IEM.
15441 * @param fMxcsr The original MXCSR value.
15442 */
15443DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
15444 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15445{
15446 iemFpSoftF32ToIprt(pr32Result, r32Result);
15447
15448 uint8_t fXcpt = pSoftState->exceptionFlags;
15449 if ( (fMxcsr & X86_MXCSR_FZ)
15450 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
15451 {
15452 /* Underflow masked and flush to zero is set. */
15453 pr32Result->s.uFraction = 0;
15454 pr32Result->s.uExponent = 0;
15455 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15456 }
15457
15458 /* If DAZ is set \#DE is never set. */
15459 if ( fMxcsr & X86_MXCSR_DAZ
15460 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15461 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15462 fXcpt &= ~X86_MXCSR_DE;
15463
15464 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15465}
15466
15467
15468/**
15469 * Helper for transfering exception to MXCSR and setting the result value
15470 * accordingly - ignores Flush-to-Zero.
15471 *
15472 * @returns Updated MXCSR.
15473 * @param pSoftState The SoftFloat state following the operation.
15474 * @param r32Result The result of the SoftFloat operation.
15475 * @param pr32Result Where to store the result for IEM.
15476 * @param fMxcsr The original MXCSR value.
15477 */
15478DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
15479 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15480{
15481 iemFpSoftF32ToIprt(pr32Result, r32Result);
15482
15483 uint8_t fXcpt = pSoftState->exceptionFlags;
15484 /* If DAZ is set \#DE is never set. */
15485 if ( fMxcsr & X86_MXCSR_DAZ
15486 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15487 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15488 fXcpt &= ~X86_MXCSR_DE;
15489
15490 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15491}
15492
15493
15494/**
15495 * Helper for transfering exception to MXCSR and setting the result value
15496 * accordingly.
15497 *
15498 * @returns Updated MXCSR.
15499 * @param pSoftState The SoftFloat state following the operation.
15500 * @param r64Result The result of the SoftFloat operation.
15501 * @param pr64Result Where to store the result for IEM.
15502 * @param fMxcsr The original MXCSR value.
15503 */
15504DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
15505 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15506{
15507 iemFpSoftF64ToIprt(pr64Result, r64Result);
15508 uint8_t fXcpt = pSoftState->exceptionFlags;
15509 if ( (fMxcsr & X86_MXCSR_FZ)
15510 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
15511 {
15512 /* Underflow masked and flush to zero is set. */
15513 iemFpSoftF64ToIprt(pr64Result, r64Result);
15514 pr64Result->s.uFractionHigh = 0;
15515 pr64Result->s.uFractionLow = 0;
15516 pr64Result->s.uExponent = 0;
15517 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15518 }
15519
15520 /* If DAZ is set \#DE is never set. */
15521 if ( fMxcsr & X86_MXCSR_DAZ
15522 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15523 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15524 fXcpt &= ~X86_MXCSR_DE;
15525
15526 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15527}
15528
15529
15530/**
15531 * Helper for transfering exception to MXCSR and setting the result value
15532 * accordingly - ignores Flush-to-Zero.
15533 *
15534 * @returns Updated MXCSR.
15535 * @param pSoftState The SoftFloat state following the operation.
15536 * @param r64Result The result of the SoftFloat operation.
15537 * @param pr64Result Where to store the result for IEM.
15538 * @param fMxcsr The original MXCSR value.
15539 */
15540DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
15541 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15542{
15543 iemFpSoftF64ToIprt(pr64Result, r64Result);
15544
15545 uint8_t fXcpt = pSoftState->exceptionFlags;
15546 /* If DAZ is set \#DE is never set. */
15547 if ( fMxcsr & X86_MXCSR_DAZ
15548 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15549 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15550 fXcpt &= ~X86_MXCSR_DE;
15551
15552 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15553}
15554
15555#endif /* IEM_WITHOUT_ASSEMBLY */
15556
15557
15558/**
15559 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
15560 * in MXCSR into account.
15561 *
15562 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15563 * @param pr32Val Where to store the result.
15564 * @param fMxcsr The input MXCSR value.
15565 * @param pr32Src The value to use.
15566 */
15567DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15568{
15569 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
15570 {
15571 if (fMxcsr & X86_MXCSR_DAZ)
15572 {
15573 /* De-normals are changed to 0. */
15574 pr32Val->s.fSign = pr32Src->s.fSign;
15575 pr32Val->s.uFraction = 0;
15576 pr32Val->s.uExponent = 0;
15577 return 0;
15578 }
15579
15580 *pr32Val = *pr32Src;
15581 return X86_MXCSR_DE;
15582 }
15583
15584 *pr32Val = *pr32Src;
15585 return 0;
15586}
15587
15588
15589/**
15590 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
15591 * in MXCSR into account.
15592 *
15593 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15594 * @param pr64Val Where to store the result.
15595 * @param fMxcsr The input MXCSR value.
15596 * @param pr64Src The value to use.
15597 */
15598DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15599{
15600 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
15601 {
15602 if (fMxcsr & X86_MXCSR_DAZ)
15603 {
15604 /* De-normals are changed to 0. */
15605 pr64Val->s64.fSign = pr64Src->s.fSign;
15606 pr64Val->s64.uFraction = 0;
15607 pr64Val->s64.uExponent = 0;
15608 return 0;
15609 }
15610
15611 *pr64Val = *pr64Src;
15612 return X86_MXCSR_DE;
15613 }
15614
15615 *pr64Val = *pr64Src;
15616 return 0;
15617}
15618
15619#ifdef IEM_WITHOUT_ASSEMBLY
15620
15621/**
15622 * Validates the given input operands returning whether the operation can continue or whether one
15623 * of the source operands contains a NaN value, setting the output accordingly.
15624 *
15625 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15626 * @param pr32Res Where to store the result in case the operation can't continue.
15627 * @param pr32Val1 The first input operand.
15628 * @param pr32Val2 The second input operand.
15629 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15630 */
15631DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
15632{
15633 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
15634 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
15635 if (cSNan + cQNan == 2)
15636 {
15637 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15638 *pr32Res = *pr32Val1;
15639 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15640 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15641 return true;
15642 }
15643 if (cSNan)
15644 {
15645 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15646 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15647 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15648 *pfMxcsr |= X86_MXCSR_IE;
15649 return true;
15650 }
15651 if (cQNan)
15652 {
15653 /* The QNan operand is placed into the result. */
15654 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15655 return true;
15656 }
15657
15658 Assert(!cQNan && !cSNan);
15659 return false;
15660}
15661
15662
15663/**
15664 * Validates the given double precision input operands returning whether the operation can continue or whether one
15665 * of the source operands contains a NaN value, setting the output accordingly.
15666 *
15667 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15668 * @param pr64Res Where to store the result in case the operation can't continue.
15669 * @param pr64Val1 The first input operand.
15670 * @param pr64Val2 The second input operand.
15671 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15672 */
15673DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
15674{
15675 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
15676 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
15677 if (cSNan + cQNan == 2)
15678 {
15679 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15680 *pr64Res = *pr64Val1;
15681 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15682 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15683 return true;
15684 }
15685 if (cSNan)
15686 {
15687 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15688 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15689 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15690 *pfMxcsr |= X86_MXCSR_IE;
15691 return true;
15692 }
15693 if (cQNan)
15694 {
15695 /* The QNan operand is placed into the result. */
15696 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15697 return true;
15698 }
15699
15700 Assert(!cQNan && !cSNan);
15701 return false;
15702}
15703
15704
15705/**
15706 * Validates the given single input operand returning whether the operation can continue or whether
15707 * contains a NaN value, setting the output accordingly.
15708 *
15709 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15710 * @param pr32Res Where to store the result in case the operation can't continue.
15711 * @param pr32Val The input operand.
15712 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15713 */
15714DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
15715{
15716 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
15717 {
15718 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15719 *pr32Res = *pr32Val;
15720 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15721 *pfMxcsr |= X86_MXCSR_IE;
15722 return true;
15723 }
15724 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
15725 {
15726 /* The QNan operand is placed into the result. */
15727 *pr32Res = *pr32Val;
15728 return true;
15729 }
15730
15731 return false;
15732}
15733
15734
15735/**
15736 * Validates the given double input operand returning whether the operation can continue or whether
15737 * contains a NaN value, setting the output accordingly.
15738 *
15739 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15740 * @param pr64Res Where to store the result in case the operation can't continue.
15741 * @param pr64Val The input operand.
15742 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15743 */
15744DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
15745{
15746 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
15747 {
15748 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15749 *pr64Res = *pr64Val;
15750 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15751 *pfMxcsr |= X86_MXCSR_IE;
15752 return true;
15753 }
15754 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
15755 {
15756 /* The QNan operand is placed into the result. */
15757 *pr64Res = *pr64Val;
15758 return true;
15759 }
15760
15761 return false;
15762}
15763
15764#endif /* IEM_WITHOUT_ASSEMBLY */
15765
15766/**
15767 * ADDPS
15768 */
15769#ifdef IEM_WITHOUT_ASSEMBLY
15770static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15771{
15772 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15773 return fMxcsr;
15774
15775 RTFLOAT32U r32Src1, r32Src2;
15776 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15777 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15778 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15779 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15780 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15781}
15782
15783
15784IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15785{
15786 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15787 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15788 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15789 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15790}
15791#endif
15792
15793
15794/**
15795 * ADDSS
15796 */
15797#ifdef IEM_WITHOUT_ASSEMBLY
15798IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15799{
15800 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15801 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15802 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15803 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15804}
15805#endif
15806
15807
15808/**
15809 * ADDPD
15810 */
15811#ifdef IEM_WITHOUT_ASSEMBLY
15812static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15813{
15814 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15815 return fMxcsr;
15816
15817 RTFLOAT64U r64Src1, r64Src2;
15818 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15819 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15820 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15821 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15822 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15823}
15824
15825
15826IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15827{
15828 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15829 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15830}
15831#endif
15832
15833
15834/**
15835 * ADDSD
15836 */
15837#ifdef IEM_WITHOUT_ASSEMBLY
15838IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15839{
15840 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15841 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15842}
15843#endif
15844
15845
15846/**
15847 * MULPS
15848 */
15849#ifdef IEM_WITHOUT_ASSEMBLY
15850static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15851{
15852 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15853 return fMxcsr;
15854
15855 RTFLOAT32U r32Src1, r32Src2;
15856 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15857 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15858 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15859 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15860 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15861}
15862
15863
15864IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15865{
15866 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15867 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15868 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15869 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15870}
15871#endif
15872
15873
15874/**
15875 * MULSS
15876 */
15877#ifdef IEM_WITHOUT_ASSEMBLY
15878IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15879{
15880 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15881 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15882 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15883 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15884}
15885#endif
15886
15887
15888/**
15889 * MULPD
15890 */
15891#ifdef IEM_WITHOUT_ASSEMBLY
15892static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15893{
15894 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15895 return fMxcsr;
15896
15897 RTFLOAT64U r64Src1, r64Src2;
15898 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15899 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15900 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15901 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15902 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15903}
15904
15905
15906IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15907{
15908 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15909 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15910}
15911#endif
15912
15913
15914/**
15915 * MULSD
15916 */
15917#ifdef IEM_WITHOUT_ASSEMBLY
15918IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15919{
15920 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15921 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15922}
15923#endif
15924
15925
15926/**
15927 * SUBPS
15928 */
15929#ifdef IEM_WITHOUT_ASSEMBLY
15930static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15931{
15932 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15933 return fMxcsr;
15934
15935 RTFLOAT32U r32Src1, r32Src2;
15936 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15937 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15938 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15939 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15940 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15941}
15942
15943
15944IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15945{
15946 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15947 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15948 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15949 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15950}
15951#endif
15952
15953
15954/**
15955 * SUBSS
15956 */
15957#ifdef IEM_WITHOUT_ASSEMBLY
15958IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15959{
15960 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15961 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15962 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15963 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15964}
15965#endif
15966
15967
15968/**
15969 * SUBPD
15970 */
15971#ifdef IEM_WITHOUT_ASSEMBLY
15972static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15973{
15974 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15975 return fMxcsr;
15976
15977 RTFLOAT64U r64Src1, r64Src2;
15978 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15979 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15980 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15981 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15982 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15983}
15984
15985
15986IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15987{
15988 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15989 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15990}
15991#endif
15992
15993
15994/**
15995 * SUBSD
15996 */
15997#ifdef IEM_WITHOUT_ASSEMBLY
15998IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15999{
16000 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16001 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16002}
16003#endif
16004
16005
16006/**
16007 * MINPS
16008 */
16009#ifdef IEM_WITHOUT_ASSEMBLY
16010static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16011{
16012 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16013 {
16014 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16015 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16016 return fMxcsr | X86_MXCSR_IE;
16017 }
16018
16019 RTFLOAT32U r32Src1, r32Src2;
16020 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16021 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16022 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16023 {
16024 *pr32Res = r32Src2;
16025 return fMxcsr;
16026 }
16027
16028 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16029 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16030 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16031 fLe
16032 ? iemFpSoftF32FromIprt(&r32Src1)
16033 : iemFpSoftF32FromIprt(&r32Src2),
16034 pr32Res, fMxcsr);
16035}
16036
16037
16038IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16039{
16040 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16041 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16042 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16043 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16044}
16045#endif
16046
16047
16048/**
16049 * MINSS
16050 */
16051#ifdef IEM_WITHOUT_ASSEMBLY
16052IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16053{
16054 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16055 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16056 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16057 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16058}
16059#endif
16060
16061
16062/**
16063 * MINPD
16064 */
16065#ifdef IEM_WITHOUT_ASSEMBLY
16066static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16067{
16068 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16069 {
16070 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16071 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16072 return fMxcsr | X86_MXCSR_IE;
16073 }
16074
16075 RTFLOAT64U r64Src1, r64Src2;
16076 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16077 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16078 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16079 {
16080 *pr64Res = r64Src2;
16081 return fMxcsr;
16082 }
16083
16084 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16085 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16086 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16087 fLe
16088 ? iemFpSoftF64FromIprt(&r64Src1)
16089 : iemFpSoftF64FromIprt(&r64Src2),
16090 pr64Res, fMxcsr);
16091}
16092
16093
16094IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16095{
16096 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16097 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16098}
16099#endif
16100
16101
16102/**
16103 * MINSD
16104 */
16105#ifdef IEM_WITHOUT_ASSEMBLY
16106IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16107{
16108 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16109 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16110}
16111#endif
16112
16113
16114/**
16115 * DIVPS
16116 */
16117#ifdef IEM_WITHOUT_ASSEMBLY
16118static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16119{
16120 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16121 return fMxcsr;
16122
16123 RTFLOAT32U r32Src1, r32Src2;
16124 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16125 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16126 if (RTFLOAT32U_IS_ZERO(&r32Src2))
16127 {
16128 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
16129 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
16130 {
16131 *pr32Res = g_ar32QNaN[1];
16132 return fMxcsr | X86_MXCSR_IE;
16133 }
16134 else if (RTFLOAT32U_IS_INF(&r32Src1))
16135 {
16136 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16137 return fMxcsr;
16138 }
16139 else
16140 {
16141 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16142 return fMxcsr | X86_MXCSR_ZE;
16143 }
16144 }
16145
16146 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16147 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16148 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16149}
16150
16151
16152IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16153{
16154 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16155 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16156 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16157 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16158}
16159#endif
16160
16161
16162/**
16163 * DIVSS
16164 */
16165#ifdef IEM_WITHOUT_ASSEMBLY
16166IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16167{
16168 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16169 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16170 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16171 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16172}
16173#endif
16174
16175
16176/**
16177 * DIVPD
16178 */
16179#ifdef IEM_WITHOUT_ASSEMBLY
16180static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16181{
16182 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16183 return fMxcsr;
16184
16185 RTFLOAT64U r64Src1, r64Src2;
16186 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16187 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16188 if (RTFLOAT64U_IS_ZERO(&r64Src2))
16189 {
16190 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
16191 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
16192 {
16193 *pr64Res = g_ar64QNaN[1];
16194 return fMxcsr | X86_MXCSR_IE;
16195 }
16196 else if (RTFLOAT64U_IS_INF(&r64Src1))
16197 {
16198 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16199 return fMxcsr;
16200 }
16201 else
16202 {
16203 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16204 return fMxcsr | X86_MXCSR_ZE;
16205 }
16206 }
16207
16208 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16209 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16210 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16211}
16212
16213
16214IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16215{
16216 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16217 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16218}
16219#endif
16220
16221
16222/**
16223 * DIVSD
16224 */
16225#ifdef IEM_WITHOUT_ASSEMBLY
16226IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16227{
16228 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16229 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16230}
16231#endif
16232
16233
16234/**
16235 * MAXPS
16236 */
16237#ifdef IEM_WITHOUT_ASSEMBLY
16238static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16239{
16240 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16241 {
16242 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16243 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16244 return fMxcsr | X86_MXCSR_IE;
16245 }
16246
16247 RTFLOAT32U r32Src1, r32Src2;
16248 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16249 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16250 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16251 {
16252 *pr32Res = r32Src2;
16253 return fMxcsr;
16254 }
16255
16256 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16257 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16258 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16259 fLe
16260 ? iemFpSoftF32FromIprt(&r32Src2)
16261 : iemFpSoftF32FromIprt(&r32Src1),
16262 pr32Res, fMxcsr);
16263}
16264
16265
16266IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16267{
16268 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16269 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16270 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16271 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16272}
16273#endif
16274
16275
16276/**
16277 * MAXSS
16278 */
16279#ifdef IEM_WITHOUT_ASSEMBLY
16280IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16281{
16282 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16283 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16284 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16285 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16286}
16287#endif
16288
16289
16290/**
16291 * MAXPD
16292 */
16293#ifdef IEM_WITHOUT_ASSEMBLY
16294static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16295{
16296 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16297 {
16298 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16299 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16300 return fMxcsr | X86_MXCSR_IE;
16301 }
16302
16303 RTFLOAT64U r64Src1, r64Src2;
16304 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16305 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16306 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16307 {
16308 *pr64Res = r64Src2;
16309 return fMxcsr;
16310 }
16311
16312 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16313 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16314 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16315 fLe
16316 ? iemFpSoftF64FromIprt(&r64Src2)
16317 : iemFpSoftF64FromIprt(&r64Src1),
16318 pr64Res, fMxcsr);
16319}
16320
16321
16322IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16323{
16324 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16325 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16326}
16327#endif
16328
16329
16330/**
16331 * MAXSD
16332 */
16333#ifdef IEM_WITHOUT_ASSEMBLY
16334IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16335{
16336 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16337 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16338}
16339#endif
16340
16341
16342/**
16343 * CVTSS2SD
16344 */
16345#ifdef IEM_WITHOUT_ASSEMBLY
16346static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16347{
16348 RTFLOAT32U r32Src1;
16349 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16350
16351 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16352 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16353 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16354}
16355
16356
16357IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16358{
16359 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
16360 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16361}
16362#endif
16363
16364
16365/**
16366 * CVTSD2SS
16367 */
16368#ifdef IEM_WITHOUT_ASSEMBLY
16369static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16370{
16371 RTFLOAT64U r64Src1;
16372 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16373
16374 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16375 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16376 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16377}
16378
16379
16380IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16381{
16382 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
16383 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16384 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16385 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16386}
16387#endif
16388
16389
16390/**
16391 * HADDPS
16392 */
16393#ifdef IEM_WITHOUT_ASSEMBLY
16394IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16395{
16396 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
16397 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
16398 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
16399 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16400}
16401#endif
16402
16403
16404/**
16405 * HADDPD
16406 */
16407#ifdef IEM_WITHOUT_ASSEMBLY
16408IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16409{
16410 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
16411 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16412}
16413#endif
16414
16415
16416/**
16417 * HSUBPS
16418 */
16419#ifdef IEM_WITHOUT_ASSEMBLY
16420IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16421{
16422 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
16423 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
16424 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
16425 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16426}
16427#endif
16428
16429
16430/**
16431 * HSUBPD
16432 */
16433#ifdef IEM_WITHOUT_ASSEMBLY
16434IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16435{
16436 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
16437 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16438}
16439#endif
16440
16441
16442/**
16443 * SQRTPS
16444 */
16445#ifdef IEM_WITHOUT_ASSEMBLY
16446static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16447{
16448 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16449 return fMxcsr;
16450
16451 RTFLOAT32U r32Src;
16452 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
16453 if (RTFLOAT32U_IS_ZERO(&r32Src))
16454 {
16455 *pr32Res = r32Src;
16456 return fMxcsr;
16457 }
16458 else if (r32Src.s.fSign)
16459 {
16460 *pr32Res = g_ar32QNaN[1];
16461 return fMxcsr | X86_MXCSR_IE;
16462 }
16463
16464 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16465 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16466 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16467}
16468
16469
16470IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16471{
16472 RT_NOREF(puSrc1);
16473
16474 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16475 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16476 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16477 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16478}
16479#endif
16480
16481
16482/**
16483 * SQRTSS
16484 */
16485#ifdef IEM_WITHOUT_ASSEMBLY
16486IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16487{
16488 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16489 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16490 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16491 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16492}
16493#endif
16494
16495
16496/**
16497 * SQRTPD
16498 */
16499#ifdef IEM_WITHOUT_ASSEMBLY
16500static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
16501{
16502 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
16503 return fMxcsr;
16504
16505 RTFLOAT64U r64Src;
16506 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
16507 if (RTFLOAT64U_IS_ZERO(&r64Src))
16508 {
16509 *pr64Res = r64Src;
16510 return fMxcsr;
16511 }
16512 else if (r64Src.s.fSign)
16513 {
16514 *pr64Res = g_ar64QNaN[1];
16515 return fMxcsr | X86_MXCSR_IE;
16516 }
16517
16518 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16519 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
16520 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16521}
16522
16523
16524IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16525{
16526 RT_NOREF(puSrc1);
16527
16528 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16529 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16530}
16531#endif
16532
16533
16534/**
16535 * SQRTSD
16536 */
16537#ifdef IEM_WITHOUT_ASSEMBLY
16538IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16539{
16540 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
16541 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16542}
16543#endif
16544
16545
16546#ifdef IEM_WITHOUT_ASSEMBLY
16547/**
16548 * RSQRTPS
16549 */
16550static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16551{
16552 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16553 return fMxcsr;
16554
16555 RTFLOAT32U r32Src;
16556 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16557 if (RTFLOAT32U_IS_ZERO(&r32Src))
16558 {
16559 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16560 return fMxcsr;
16561 }
16562 else if (r32Src.s.fSign)
16563 {
16564 *pr32Res = g_ar32QNaN[1];
16565 return fMxcsr | X86_MXCSR_IE;
16566 }
16567
16568 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16569 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16570 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16571}
16572
16573
16574IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16575{
16576 RT_NOREF(puSrc1);
16577
16578 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16579 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16580 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16581 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16582}
16583
16584
16585/**
16586 * RSQRTSS
16587 */
16588IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16589{
16590 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16591 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16592 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16593 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16594}
16595#endif
16596
16597
16598/**
16599 * RCPPS
16600 */
16601#ifdef IEM_WITHOUT_ASSEMBLY
16602static uint32_t iemAImpl_rcp_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16603{
16604 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16605 return fMxcsr;
16606
16607 RTFLOAT32U r32Src;
16608 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16609 if (RTFLOAT32U_IS_ZERO(&r32Src))
16610 {
16611 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16612 return fMxcsr;
16613 }
16614
16615 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16616 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&g_ar32One[0]), iemFpSoftF32FromIprt(&r32Src), &SoftState);
16617 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16618}
16619
16620
16621IEM_DECL_IMPL_DEF(void, iemAImpl_rcpps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16622{
16623 RT_NOREF(puSrc1);
16624
16625 pResult->MXCSR = iemAImpl_rcp_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16626 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16627 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16628 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16629}
16630
16631
16632/**
16633 * RCPSS
16634 */
16635IEM_DECL_IMPL_DEF(void, iemAImpl_rcpss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16636{
16637 pResult->MXCSR = iemAImpl_rcp_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16638 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16639 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16640 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16641}
16642#endif
16643
16644
16645/**
16646 * ADDSUBPS
16647 */
16648#ifdef IEM_WITHOUT_ASSEMBLY
16649IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16650{
16651 RT_NOREF(puSrc1);
16652
16653 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16654 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16655 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16656 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16657}
16658#endif
16659
16660
16661/**
16662 * ADDSUBPD
16663 */
16664#ifdef IEM_WITHOUT_ASSEMBLY
16665IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16666{
16667 RT_NOREF(puSrc1);
16668
16669 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16670 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16671}
16672#endif
16673
16674
16675/**
16676 * CVTPD2PS
16677 */
16678#ifdef IEM_WITHOUT_ASSEMBLY
16679static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16680{
16681 RTFLOAT64U r64Src1;
16682 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16683
16684 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16685 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16686 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16687}
16688
16689
16690IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16691{
16692 RT_NOREF(puSrc1);
16693
16694 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16695 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16696 pResult->uResult.au32[2] = 0;
16697 pResult->uResult.au32[3] = 0;
16698}
16699#endif
16700
16701
16702/**
16703 * CVTPS2PD
16704 */
16705#ifdef IEM_WITHOUT_ASSEMBLY
16706static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16707{
16708 RTFLOAT32U r32Src1;
16709 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16710
16711 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16712 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16713 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16714}
16715
16716
16717IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16718{
16719 RT_NOREF(puSrc1);
16720
16721 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16722 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16723}
16724#endif
16725
16726
16727/**
16728 * CVTDQ2PS
16729 */
16730#ifdef IEM_WITHOUT_ASSEMBLY
16731static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
16732{
16733 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16734 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
16735 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16736}
16737
16738
16739IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16740{
16741 RT_NOREF(puSrc1);
16742
16743 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16744 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16745 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
16746 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
16747}
16748#endif
16749
16750
16751/**
16752 * CVTPS2DQ
16753 */
16754#ifdef IEM_WITHOUT_ASSEMBLY
16755static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16756{
16757 RTFLOAT32U r32Src;
16758 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16759
16760 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16761 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16762 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16763}
16764
16765
16766IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16767{
16768 RT_NOREF(puSrc1);
16769
16770 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16771 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16772 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16773 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16774}
16775#endif
16776
16777
16778/**
16779 * CVTTPS2DQ
16780 */
16781#ifdef IEM_WITHOUT_ASSEMBLY
16782static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16783{
16784 RTFLOAT32U r32Src;
16785 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16786
16787 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16788 SoftState.roundingMode = softfloat_round_minMag;
16789 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16790 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16791}
16792
16793
16794IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16795{
16796 RT_NOREF(puSrc1);
16797
16798 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16799 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16800 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16801 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16802}
16803#endif
16804
16805
16806/**
16807 * CVTTPD2DQ
16808 */
16809#ifdef IEM_WITHOUT_ASSEMBLY
16810static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16811{
16812 RTFLOAT64U r64Src;
16813 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16814
16815 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16816 SoftState.roundingMode = softfloat_round_minMag;
16817 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16818 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16819}
16820
16821
16822IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16823{
16824 RT_NOREF(puSrc1);
16825
16826 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16827 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16828 pResult->uResult.au64[1] = 0;
16829}
16830#endif
16831
16832
16833/**
16834 * CVTDQ2PD
16835 */
16836#ifdef IEM_WITHOUT_ASSEMBLY
16837static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
16838{
16839 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16840 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
16841 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16842}
16843
16844
16845IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16846{
16847 RT_NOREF(puSrc1);
16848
16849 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16850 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16851}
16852#endif
16853
16854
16855/**
16856 * CVTPD2DQ
16857 */
16858#ifdef IEM_WITHOUT_ASSEMBLY
16859static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16860{
16861 RTFLOAT64U r64Src;
16862 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16863
16864 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16865 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16866 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16867}
16868
16869
16870IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16871{
16872 RT_NOREF(puSrc1);
16873
16874 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16875 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16876 pResult->uResult.au64[1] = 0;
16877}
16878#endif
16879
16880
16881/**
16882 * [V]SHUFPS
16883 */
16884#ifdef IEM_WITHOUT_ASSEMBLY
16885IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16886{
16887 RTUINT128U const uSrc1 = *puDst;
16888 RTUINT128U const uSrc2 = *puSrc;
16889 ASMCompilerBarrier();
16890 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16891 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16892 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16893 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16894}
16895#endif
16896
16897
16898IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16899{
16900 RTUINT128U const uSrc1 = *puSrc1;
16901 RTUINT128U const uSrc2 = *puSrc2;
16902 ASMCompilerBarrier();
16903 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16904 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16905 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16906 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16907}
16908
16909
16910IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16911{
16912 RTUINT256U const uSrc1 = *puSrc1;
16913 RTUINT256U const uSrc2 = *puSrc2;
16914 ASMCompilerBarrier();
16915 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16916 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16917 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16918 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16919
16920 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
16921 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
16922 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
16923 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
16924}
16925
16926
16927/**
16928 * [V]SHUFPD
16929 */
16930#ifdef IEM_WITHOUT_ASSEMBLY
16931IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16932{
16933 RTUINT128U const uSrc1 = *puDst;
16934 RTUINT128U const uSrc2 = *puSrc;
16935 ASMCompilerBarrier();
16936 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16937 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16938}
16939#endif
16940
16941
16942IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16943{
16944 RTUINT128U const uSrc1 = *puSrc1;
16945 RTUINT128U const uSrc2 = *puSrc2;
16946 ASMCompilerBarrier();
16947 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16948 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16949}
16950
16951
16952IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16953{
16954 RTUINT256U const uSrc1 = *puSrc1;
16955 RTUINT256U const uSrc2 = *puSrc2;
16956 ASMCompilerBarrier();
16957 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16958 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16959 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
16960 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
16961}
16962
16963
16964/*
16965 * PHMINPOSUW / VPHMINPOSUW
16966 */
16967IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16968{
16969 uint16_t u16Min = puSrc->au16[0];
16970 uint8_t idxMin = 0;
16971
16972 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
16973 if (puSrc->au16[i] < u16Min)
16974 {
16975 u16Min = puSrc->au16[i];
16976 idxMin = i;
16977 }
16978
16979 puDst->au64[0] = 0;
16980 puDst->au64[1] = 0;
16981 puDst->au16[0] = u16Min;
16982 puDst->au16[1] = idxMin;
16983}
16984
16985
16986IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16987{
16988 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
16989}
16990
16991
16992/**
16993 * VPERMILPS
16994 */
16995#ifdef IEM_WITHOUT_ASSEMBLY
16996IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16997{
16998 RTUINT128U const uSrc = *puSrc;
16999 ASMCompilerBarrier();
17000
17001 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17002 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17003 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17004 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17005}
17006
17007
17008IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17009{
17010 RTUINT256U const uSrc = *puSrc;
17011 ASMCompilerBarrier();
17012
17013 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17014 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17015 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17016 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17017
17018 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17019 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17020 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17021 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17022}
17023
17024IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17025{
17026 RTUINT128U const uSrc1 = *puSrc1;
17027 RTUINT128U const uSrc2 = *puSrc2;
17028 ASMCompilerBarrier();
17029
17030 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17031 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17032 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17033 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17034}
17035
17036IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17037{
17038 RTUINT256U const uSrc1 = *puSrc1;
17039 RTUINT256U const uSrc2 = *puSrc2;
17040 ASMCompilerBarrier();
17041
17042 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17043 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17044 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17045 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17046
17047 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17048 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17049 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17050 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17051}
17052#endif
17053
17054
17055IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17056{
17057 RTUINT128U const uSrc = *puSrc;
17058 ASMCompilerBarrier();
17059
17060 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17061 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17062 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17063 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17064}
17065
17066
17067IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17068{
17069 RTUINT256U const uSrc = *puSrc;
17070 ASMCompilerBarrier();
17071
17072 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17073 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17074 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17075 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17076
17077 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17078 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17079 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17080 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17081}
17082
17083IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17084{
17085 RTUINT128U const uSrc1 = *puSrc1;
17086 RTUINT128U const uSrc2 = *puSrc2;
17087 ASMCompilerBarrier();
17088
17089 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17090 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17091 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17092 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17093}
17094
17095IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17096{
17097 RTUINT256U const uSrc1 = *puSrc1;
17098 RTUINT256U const uSrc2 = *puSrc2;
17099 ASMCompilerBarrier();
17100
17101 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17102 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17103 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17104 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17105
17106 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17107 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17108 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17109 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17110}
17111
17112
17113/**
17114 * VPERMILPD
17115 */
17116#ifdef IEM_WITHOUT_ASSEMBLY
17117IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17118{
17119 RTUINT128U const uSrc = *puSrc;
17120 ASMCompilerBarrier();
17121
17122 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17123 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17124}
17125
17126
17127IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17128{
17129 RTUINT256U const uSrc = *puSrc;
17130 ASMCompilerBarrier();
17131
17132 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17133 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17134
17135 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17136 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17137}
17138
17139IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17140{
17141 RTUINT128U const uSrc1 = *puSrc1;
17142 RTUINT128U const uSrc2 = *puSrc2;
17143 ASMCompilerBarrier();
17144
17145 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17146 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17147}
17148
17149IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17150{
17151 RTUINT256U const uSrc1 = *puSrc1;
17152 RTUINT256U const uSrc2 = *puSrc2;
17153 ASMCompilerBarrier();
17154
17155 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17156 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17157
17158 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17159 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17160}
17161#endif
17162
17163
17164IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17165{
17166 RTUINT128U const uSrc = *puSrc;
17167 ASMCompilerBarrier();
17168
17169 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17170 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17171}
17172
17173
17174IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17175{
17176 RTUINT256U const uSrc = *puSrc;
17177 ASMCompilerBarrier();
17178
17179 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17180 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17181
17182 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17183 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17184}
17185
17186IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17187{
17188 RTUINT128U const uSrc1 = *puSrc1;
17189 RTUINT128U const uSrc2 = *puSrc2;
17190 ASMCompilerBarrier();
17191
17192 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17193 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17194}
17195
17196IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17197{
17198 RTUINT256U const uSrc1 = *puSrc1;
17199 RTUINT256U const uSrc2 = *puSrc2;
17200 ASMCompilerBarrier();
17201
17202 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17203 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17204
17205 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17206 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17207}
17208
17209
17210/*
17211 * [V]PBLENDVB
17212 */
17213IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17214{
17215 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17216 if (puMask->au8[i] & RT_BIT(7))
17217 puDst->au8[i] = puSrc->au8[i];
17218}
17219
17220
17221IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17222{
17223 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17224 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17225}
17226
17227
17228IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17229{
17230 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17231 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17232}
17233
17234
17235/*
17236 * [V]BLENDVPS
17237 */
17238IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17239{
17240 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17241 if (puMask->au32[i] & RT_BIT_32(31))
17242 puDst->au32[i] = puSrc->au32[i];
17243}
17244
17245
17246IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17247{
17248 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17249 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17250}
17251
17252
17253IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17254{
17255 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17256 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17257}
17258
17259
17260/*
17261 * [V]BLENDVPD
17262 */
17263IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17264{
17265 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
17266 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
17267}
17268
17269
17270IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17271{
17272 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17273 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17274}
17275
17276
17277IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17278{
17279 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17280 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17281}
17282
17283
17284/**
17285 * [V]PALIGNR
17286 */
17287IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
17288{
17289 uint64_t const u64Src1 = *pu64Dst;
17290 ASMCompilerBarrier();
17291
17292 if (bEvil >= 16)
17293 *pu64Dst = 0;
17294 else if (bEvil >= 8)
17295 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
17296 else
17297 {
17298 uint8_t cShift = bEvil * 8;
17299 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
17300 | (u64Src2 >> cShift);
17301 }
17302}
17303
17304
17305IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17306{
17307 RTUINT128U const uSrc1 = *puDst;
17308 RTUINT128U const uSrc2 = *puSrc;
17309 ASMCompilerBarrier();
17310
17311 puDst->au64[0] = 0;
17312 puDst->au64[1] = 0;
17313 if (bEvil >= 32)
17314 { /* Everything stays 0. */ }
17315 else if (bEvil >= 16)
17316 {
17317 bEvil -= 16;
17318 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17319 puDst->au8[i - bEvil] = uSrc1.au8[i];
17320 }
17321 else
17322 {
17323 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17324 puDst->au8[i] = uSrc2.au8[i + bEvil];
17325 for (uint8_t i = 0; i < bEvil; i++)
17326 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17327 }
17328}
17329
17330
17331IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17332{
17333 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17334 RTUINT128U const uSrc2 = *puSrc2;
17335 ASMCompilerBarrier();
17336
17337 puDst->au64[0] = 0;
17338 puDst->au64[1] = 0;
17339 if (bEvil >= 32)
17340 { /* Everything stays 0. */ }
17341 else if (bEvil >= 16)
17342 {
17343 bEvil -= 16;
17344 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17345 puDst->au8[i - bEvil] = uSrc1.au8[i];
17346 }
17347 else
17348 {
17349 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17350 puDst->au8[i] = uSrc2.au8[i + bEvil];
17351 for (uint8_t i = 0; i < bEvil; i++)
17352 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17353 }
17354}
17355
17356
17357IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17358{
17359 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17360 RTUINT256U const uSrc2 = *puSrc2;
17361 ASMCompilerBarrier();
17362
17363 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
17364 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
17365}
17366
17367
17368/**
17369 * [V]PBLENDW
17370 */
17371IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17372{
17373 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17374 if (bEvil & RT_BIT(i))
17375 puDst->au16[i] = puSrc->au16[i];
17376}
17377
17378
17379IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17380{
17381 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17382 if (bEvil & RT_BIT(i))
17383 puDst->au16[i] = puSrc2->au16[i];
17384 else
17385 puDst->au16[i] = puSrc1->au16[i];
17386}
17387
17388
17389IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17390{
17391 for (uint8_t i = 0; i < 8; i++)
17392 if (bEvil & RT_BIT(i))
17393 {
17394 puDst->au16[ i] = puSrc2->au16[ i];
17395 puDst->au16[8 + i] = puSrc2->au16[8 + i];
17396 }
17397 else
17398 {
17399 puDst->au16[ i] = puSrc1->au16[ i];
17400 puDst->au16[8 + i] = puSrc1->au16[8 + i];
17401 }
17402}
17403
17404
17405/**
17406 * [V]BLENDPS
17407 */
17408IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17409{
17410 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17411 if (bEvil & RT_BIT(i))
17412 puDst->au32[i] = puSrc->au32[i];
17413}
17414
17415
17416IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17417{
17418 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17419 if (bEvil & RT_BIT(i))
17420 puDst->au32[i] = puSrc2->au32[i];
17421 else
17422 puDst->au32[i] = puSrc1->au32[i];
17423}
17424
17425
17426IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17427{
17428 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17429 if (bEvil & RT_BIT(i))
17430 puDst->au32[i] = puSrc2->au32[i];
17431 else
17432 puDst->au32[i] = puSrc1->au32[i];
17433}
17434
17435
17436/**
17437 * [V]BLENDPD
17438 */
17439IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17440{
17441 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17442 if (bEvil & RT_BIT(i))
17443 puDst->au64[i] = puSrc->au64[i];
17444}
17445
17446
17447IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17448{
17449 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17450 if (bEvil & RT_BIT(i))
17451 puDst->au64[i] = puSrc2->au64[i];
17452 else
17453 puDst->au64[i] = puSrc1->au64[i];
17454}
17455
17456
17457IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17458{
17459 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17460 if (bEvil & RT_BIT(i))
17461 puDst->au64[i] = puSrc2->au64[i];
17462 else
17463 puDst->au64[i] = puSrc1->au64[i];
17464}
17465
17466
17467/**
17468 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
17469 */
17470
/* The S-Box lookup table, indexed by the byte to substitute (row = high nibble, column = low nibble). */
static uint8_t iemAImpl_aes_sbox[] =
{
    /* 0x00 */ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    /* 0x10 */ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    /* 0x20 */ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    /* 0x30 */ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    /* 0x40 */ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    /* 0x50 */ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    /* 0x60 */ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    /* 0x70 */ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    /* 0x80 */ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    /* 0x90 */ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    /* 0xa0 */ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    /* 0xb0 */ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    /* 0xc0 */ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    /* 0xd0 */ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    /* 0xe0 */ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    /* 0xf0 */ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
17489
/* The InvS-Box lookup table, indexed by the byte to substitute (row = high nibble, column = low nibble). */
static uint8_t iemAImpl_aes_inv_sbox[] =
{
    /* 0x00 */ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    /* 0x10 */ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    /* 0x20 */ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    /* 0x30 */ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    /* 0x40 */ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    /* 0x50 */ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    /* 0x60 */ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    /* 0x70 */ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    /* 0x80 */ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    /* 0x90 */ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    /* 0xa0 */ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    /* 0xb0 */ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    /* 0xc0 */ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    /* 0xd0 */ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    /* 0xe0 */ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    /* 0xf0 */ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
17509
/* The ShiftRows lookup table: entry n is the source byte index that ends up at byte n. */
static uint8_t iemAImpl_aes_shift_rows_tbl[] =
{
     0,  5, 10, 15,
     4,  9, 14,  3,
     8, 13,  2,  7,
    12,  1,  6, 11
};
17514
/* The InvShiftRows lookup table: entry n is the source byte index that ends up at byte n. */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[] =
{
     0, 13, 10,  7,
     4,  1, 14, 11,
     8,  5,  2, 15,
    12,  9,  6,  3
};
17519
17520static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
17521{
17522 RTUINT128U uVal;
17523 int i;
17524
17525 for (i = 0; i < 16; ++i)
17526 uVal.au8[i] = abSubst[puSrc->au8[i]];
17527
17528 return uVal;
17529}
17530
/**
 * Doubles a byte: shifts it left by one and XORs in 0x1b (27) when the top
 * bit was set, keeping only the low eight bits.
 *
 * @returns The doubled byte.
 * @param   u   The byte to double.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    uint8_t const bShifted = (uint8_t)(u << 1);
    if (u & 0x80)
        return (uint8_t)(bShifted ^ 0x1b);
    return bShifted;
}
17535
17536static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
17537{
17538 RTUINT128U uVal;
17539 int i;
17540 uint8_t tmp;
17541
17542 for (i = 0; i < 16; i += 4) {
17543 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
17544 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
17545 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
17546 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
17547 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
17548 }
17549
17550 return uVal;
17551}
17552
17553static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
17554{
17555 RTUINT128U uVal;
17556 int i;
17557
17558 for (i = 0; i < 16; ++i)
17559 uVal.au8[i] = puSrc->au8[abShift[i]];
17560
17561 return uVal;
17562}
17563
17564static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
17565{
17566 uint8_t val;
17567
17568 val = ((b >> 0) & 1) * a;
17569 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
17570 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
17571 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
17572 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
17573
17574 return val;
17575}
17576
17577static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
17578{
17579 RTUINT128U uVal;
17580 int i;
17581
17582 for (i = 0; i < 16; i += 4) {
17583 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
17584 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
17585 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
17586 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
17587 }
17588
17589 return uVal;
17590}
17591
17592static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
17593{
17594 RTUINT32U uTmp;
17595
17596 uTmp.au32[0] = w;
17597 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
17598 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
17599 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
17600 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
17601
17602 return uTmp.au32[0];
17603}
17604
/**
 * Rotates a dword right by eight bits, moving the lowest byte to the top.
 *
 * @returns The rotated dword.
 * @param   w   The input dword.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uUpperThree = w >> 8;    /* bytes 1..3 move down one position */
    uint32_t const uWrapped    = w << 24;   /* byte 0 wraps around to the top */
    return uWrapped | uUpperThree;
}
17609
17610/**
17611 * [V]AESKEYGENASSIST
17612 */
17613IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
17614{
17615 RTUINT128U uTmp;
17616 uint32_t uRCon = bImm; /* Round constant. */
17617
17618 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
17619 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
17620 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
17621 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
17622
17623 *puDst = uTmp;
17624}
17625
17626
17627/**
17628 * [V]AESIMC
17629 */
17630IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17631{
17632 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
17633}
17634
17635
17636/**
17637 * [V]AESENC
17638 */
17639IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17640{
17641 RTUINT128U uTmp;
17642
17643 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17644 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17645 uTmp = iemAImpl_aes_mix_col(&uTmp);
17646 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17647 uTmp.au64[1] ^= puSrc->au64[1];
17648
17649 *puDst = uTmp;
17650}
17651
17652
17653/**
17654 * [V]AESENCLAST
17655 */
17656IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17657{
17658 RTUINT128U uTmp;
17659
17660 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17661 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17662 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17663 uTmp.au64[1] ^= puSrc->au64[1];
17664
17665 *puDst = uTmp;
17666}
17667
17668
17669/**
17670 * [V]AESDEC
17671 */
17672IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17673{
17674 RTUINT128U uTmp;
17675
17676 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17677 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17678 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
17679 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17680 uTmp.au64[1] ^= puSrc->au64[1];
17681
17682 *puDst = uTmp;
17683}
17684
17685
17686/**
17687 * [V]AESDECLAST
17688 */
17689IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17690{
17691 RTUINT128U uTmp;
17692
17693 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17694 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17695 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17696 uTmp.au64[1] ^= puSrc->au64[1];
17697
17698 *puDst = uTmp;
17699}
17700
17701
17702/**
17703 * [V]PCMPISTRI
17704 */
17705
17706/**
17707 * Does the comparisons based on the mode and source input format.
17708 */
17709static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
17710{
17711#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
17712 do \
17713 { \
17714 for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
17715 for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
17716 { \
17717 switch (a_bAggOp) \
17718 { \
17719 case 0: \
17720 case 2: \
17721 case 3: \
17722 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
17723 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
17724 break; \
17725 case 1: \
17726 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
17727 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
17728 break; \
17729 default: \
17730 AssertReleaseFailed(); \
17731 } \
17732 } \
17733 } while(0)
17734
17735 uint8_t bAggOp = (bImm >> 2) & 0x3;
17736 switch (bImm & 0x3)
17737 {
17738 case 0:
17739 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
17740 break;
17741 case 1:
17742 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
17743 break;
17744 case 2:
17745 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
17746 break;
17747 case 3:
17748 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
17749 break;
17750 default:
17751 AssertReleaseFailed();
17752 }
17753#undef PCMPXSTRX_CMP_CASE
17754}
17755
17756static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
17757{
17758 if (bImm & 0x1)
17759 {
17760 /* Words -> 8 elements. */
17761 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
17762 if (puSrc->au16[i] == 0)
17763 return i;
17764
17765 return 8;
17766 }
17767 else
17768 {
17769 /* Bytes -> 16 elements. */
17770 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
17771 if (puSrc->au8[i] == 0)
17772 return i;
17773
17774 return 16;
17775 }
17776}
17777
/**
 * Converts an explicitly given string length into an element count.
 *
 * @returns The absolute value of i64Len, saturated at 8 (words) or 16 (bytes).
 * @param   i64Len  The signed length.
 * @param   bImm    The immediate; bit 0 set means word elements, clear means bytes.
 */
static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
{
    int64_t const cMaxElems = (bImm & 0x1) ? 8 : 16;

    /* Anything at or beyond the maximum, in either direction, saturates. */
    if (i64Len <= -cMaxElems || i64Len >= cMaxElems)
        return (uint8_t)cMaxElems;

    return (uint8_t)(i64Len >= 0 ? i64Len : -i64Len);
}
17795
/**
 * Forced comparison results for element pairs where at least one element is
 * invalid (Table 4-7 from 4.1.6 of SDM).
 *
 * Rows are indexed by the aggregation mode (Imm8[3:2]).  Columns are:
 *      - 0: xmm1 AND xmm2/m128 invalid,
 *      - 1: xmm1 invalid BUT xmm2/m128 valid,
 *      - 2: xmm1 valid BUT xmm2/m128 invalid,
 *      - 3: unused dummy/padding for parfait.
 */
static const bool g_afCmpOverride[4][4] =
{
    /* Imm8[3:2] = 00b (equal any)     */ { false, false, false, false },
    /* Imm8[3:2] = 01b (ranges)        */ { false, false, false, false },
    /* Imm8[3:2] = 10b (equal each)    */ { true,  false, false, false },
    /* Imm8[3:2] = 11b (equal ordered) */ { true,  true,  false, false },
};
17807
17808DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
17809{
17810 if (fSrc1Valid && fSrc2Valid)
17811 return fCmpRes;
17812
17813 uint8_t const bSrc1Valid = fSrc1Valid ? 2 : 0;
17814 uint8_t const bSrc2Valid = fSrc2Valid ? 1 : 0;
17815 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
17816}
17817
17818static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
17819{
17820 uint8_t bAggOp = (bImm >> 2) & 0x3;
17821 uint16_t u16Result = 0;
17822
17823 switch (bAggOp)
17824 {
17825 case 0: /* Equal any */
17826 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
17827 {
17828 uint16_t u16Res = 0;
17829 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1++)
17830 {
17831 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
17832 idxSrc1 < idxLen1,
17833 idxSrc2 < idxLen2,
17834 bAggOp))
17835 {
17836 u16Res = RT_BIT(idxSrc2);
17837 break;
17838 }
17839 }
17840
17841 u16Result |= u16Res;
17842 }
17843 break;
17844
17845 case 1: /* Ranges */
17846 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
17847 {
17848 uint16_t u16Res = 0;
17849 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1 += 2)
17850 {
17851 if ( iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
17852 idxSrc1 < idxLen1,
17853 idxSrc2 < idxLen2,
17854 bAggOp)
17855 && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1 + 1],
17856 (idxSrc1 + 1) < idxLen1,
17857 idxSrc2 < idxLen2,
17858 bAggOp))
17859 {
17860 u16Res = RT_BIT(idxSrc2);
17861 break;
17862 }
17863 }
17864
17865 u16Result |= u16Res;
17866 }
17867 break;
17868
17869 case 2: /* Equal each */
17870 for (uint8_t i = 0; i < cElems; i++)
17871 {
17872 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[i][i],
17873 i < idxLen1,
17874 i < idxLen2,
17875 bAggOp))
17876 u16Result |= RT_BIT(i);
17877 }
17878 break;
17879
17880 case 3: /* Equal ordered */
17881 u16Result = 0;
17882 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
17883 {
17884 uint16_t u16Res = RT_BIT(idxSrc2);
17885 for (uint8_t idxSrc1 = 0, k = idxSrc2; (idxSrc1 < (cElems - idxSrc2)) && (k < cElems); idxSrc1++, k++)
17886 {
17887 if (!iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[k][idxSrc1],
17888 idxSrc1 < idxLen1,
17889 k < idxLen2,
17890 bAggOp))
17891 {
17892 u16Res = 0;
17893 break;
17894 }
17895 }
17896
17897 u16Result |= u16Res;
17898 }
17899 break;
17900 }
17901
17902 /* Polarity selection. */
17903 switch ((bImm >> 4) & 0x3)
17904 {
17905 case 0:
17906 case 2:
17907 /* Nothing to do. */
17908 break;
17909 case 1:
17910 u16Result = (cElems == 8 ? 0xff : 0xffff) ^ u16Result;
17911 break;
17912 case 3:
17913 u16Result ^= RT_BIT(idxLen2) - 1;
17914 break;
17915 default:
17916 AssertReleaseFailed();
17917 }
17918
17919 return u16Result;
17920}
17921
17922DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
17923{
17924 uint32_t fEFlags = 0;
17925
17926 if (u16Result)
17927 fEFlags |= X86_EFL_CF;
17928 if (cLen2 < cElems)
17929 fEFlags |= X86_EFL_ZF;
17930 if (cLen1 < cElems)
17931 fEFlags |= X86_EFL_SF;
17932 if (u16Result & 0x1)
17933 fEFlags |= X86_EFL_OF;
17934 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
17935}
17936
17937DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
17938 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
17939{
17940 bool afCmpRes[16][16];
17941 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17942
17943 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
17944 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
17945 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
17946
17947 return u16Result;
17948}
17949
17950DECL_FORCE_INLINE(void) iemAImpl_pcmpxstri_set_result_index(uint32_t *pu32Ecx, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
17951{
17952 if (bImm & RT_BIT(6))
17953 {
17954 /* Index for MSB set. */
17955 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
17956 if (idxMsb)
17957 *pu32Ecx = idxMsb - 1;
17958 else
17959 *pu32Ecx = cElems;
17960 }
17961 else
17962 {
17963 /* Index for LSB set. */
17964 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
17965 if (idxLsb)
17966 *pu32Ecx = idxLsb - 1;
17967 else
17968 *pu32Ecx = cElems;
17969 }
17970}
17971
17972IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
17973{
17974 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17975 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
17976 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
17977
17978 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17979 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
17980}
17981
17982
17983/**
17984 * [V]PCMPESTRI
17985 */
17986IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
17987{
17988 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17989 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
17990 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
17991
17992 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17993 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
17994}
17995
17996
17997/**
17998 * [V]PCMPISTRM
17999 */
18000DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
18001{
18002 if (bImm & RT_BIT(6))
18003 {
18004 /* Generate a mask. */
18005 if (cElems == 8)
18006 {
18007 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18008 if (u16Result & RT_BIT(i))
18009 puDst->au16[i] = 0xffff;
18010 else
18011 puDst->au16[i] = 0;
18012 }
18013 else
18014 {
18015 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
18016 if (u16Result & RT_BIT(i))
18017 puDst->au8[i] = 0xff;
18018 else
18019 puDst->au8[i] = 0;
18020 }
18021 }
18022 else
18023 {
18024 /* Store the result. */
18025 puDst->au64[0] = u16Result;
18026 puDst->au64[1] = 0;
18027 }
18028}
18029
18030IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
18031{
18032 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18033 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
18034 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
18035
18036 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18037 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18038}
18039
18040
18041/**
18042 * [V]PCMPESTRM
18043 */
18044IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
18045{
18046 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18047 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
18048 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
18049
18050 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18051 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18052}
18053
18054
18055/*
18056 * [V]PCLMULQDQ
18057 */
18058IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18059{
18060 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
18061}
18062
18063
18064IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
18065{
18066 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
18067 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
18068
18069 puDst->au64[0] = 0;
18070 puDst->au64[1] = 0;
18071
18072 /*
18073 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
18074 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
18075 * and squeeze out some optimizations.
18076 */
18077 if (uSrc1 & 0x1)
18078 puDst->au64[0] = uSrc2;
18079
18080 uSrc1 >>= 1;
18081
18082 uint8_t iDigit = 1;
18083 while (uSrc1)
18084 {
18085 if (uSrc1 & 0x1)
18086 {
18087 puDst->au64[0] ^= (uSrc2 << iDigit);
18088 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
18089 }
18090
18091 uSrc1 >>= 1;
18092 iDigit++;
18093 }
18094}
18095
18096
18097/**
18098 * [V]PINSRW
18099 */
18100#ifdef IEM_WITHOUT_ASSEMBLY
18101IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u64,(uint64_t *pu64Dst, uint16_t u16Src, uint8_t bEvil))
18102{
18103 uint8_t cShift = (bEvil & 0x3) * 16;
18104 *pu64Dst = (*pu64Dst & ~(UINT64_C(0xffff) << cShift)) | ((uint64_t)u16Src << cShift);
18105}
18106
18107
18108IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u128,(PRTUINT128U puDst, uint16_t u16Src, uint8_t bEvil))
18109{
18110 puDst->au16[bEvil & 0x7] = u16Src;
18111}
18112#endif
18113
18114
18115IEM_DECL_IMPL_DEF(void, iemAImpl_vpinsrw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint16_t u16Src, uint8_t bEvil))
18116{
18117 *puDst = *puSrc;
18118 puDst->au16[bEvil & 0x7] = u16Src;
18119}
18120
18121
18122/**
18123 * [V]PEXTRW
18124 */
18125#ifdef IEM_WITHOUT_ASSEMBLY
18126IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u64,(uint16_t *pu16Dst, uint64_t u64Src, uint8_t bEvil))
18127{
18128 *pu16Dst = (uint16_t)(u64Src >> ((bEvil & 0x3) * 16));
18129}
18130
18131
18132IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u128,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
18133{
18134 *pu16Dst = puSrc->au16[bEvil & 0x7];
18135}
18136
18137#endif
18138
18139IEM_DECL_IMPL_DEF(void, iemAImpl_vpextrw_u128_fallback,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
18140{
18141 *pu16Dst = puSrc->au16[bEvil & 0x7];
18142}
18143
18144
18145/**
18146 * [V]MOVMSKPS
18147 */
18148#ifdef IEM_WITHOUT_ASSEMBLY
18149IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18150{
18151 *pu8Dst = puSrc->au32[0] >> 31;
18152 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18153 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18154 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18155}
18156
18157#endif
18158
18159IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18160{
18161 *pu8Dst = puSrc->au32[0] >> 31;
18162 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18163 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18164 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18165}
18166
18167
18168IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18169{
18170 *pu8Dst = puSrc->au32[0] >> 31;
18171 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18172 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18173 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18174 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
18175 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
18176 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
18177 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
18178}
18179
18180
18181/**
18182 * [V]MOVMSKPD
18183 */
18184#ifdef IEM_WITHOUT_ASSEMBLY
18185IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18186{
18187 *pu8Dst = puSrc->au64[0] >> 63;
18188 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18189}
18190
18191#endif
18192
18193IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18194{
18195 *pu8Dst = puSrc->au64[0] >> 63;
18196 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18197}
18198
18199
18200IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18201{
18202 *pu8Dst = puSrc->au64[0] >> 63;
18203 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18204 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
18205 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
18206}
18207
18208
18209/**
18210 * CVTTSD2SI
18211 */
18212#ifdef IEM_WITHOUT_ASSEMBLY
18213IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
18214{
18215 RTFLOAT64U r64Src;
18216
18217 r64Src.u = *pu64Src;
18218 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
18219
18220 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18221 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18222 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18223}
18224
18225
18226IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
18227{
18228 RTFLOAT64U r64Src;
18229
18230 r64Src.u = *pu64Src;
18231 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
18232
18233 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18234 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18235 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18236}
18237#endif
18238
18239
18240/**
18241 * CVTSD2SI
18242 */
18243#ifdef IEM_WITHOUT_ASSEMBLY
18244IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
18245{
18246 RTFLOAT64U r64Src;
18247
18248 r64Src.u = *pu64Src;
18249 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
18250
18251 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18252 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18253 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18254}
18255
18256
18257IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
18258{
18259 RTFLOAT64U r64Src;
18260
18261 r64Src.u = *pu64Src;
18262 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
18263
18264 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18265 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18266 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18267}
18268#endif
18269
18270
18271/**
18272 * CVTTSS2SI
18273 */
18274#ifdef IEM_WITHOUT_ASSEMBLY
18275IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
18276{
18277 RTFLOAT32U r32Src;
18278
18279 r32Src.u = *pu32Src;
18280 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
18281
18282 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18283 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18284 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18285}
18286
18287
18288IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
18289{
18290 RTFLOAT32U r32Src;
18291
18292 r32Src.u = *pu32Src;
18293 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
18294
18295 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18296 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18297 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18298}
18299#endif
18300
18301
18302/**
18303 * CVTSS2SI
18304 */
18305#ifdef IEM_WITHOUT_ASSEMBLY
18306IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
18307{
18308 RTFLOAT32U r32Src;
18309
18310 r32Src.u = *pu32Src;
18311 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
18312
18313 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18314 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18315 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18316}
18317
18318
18319IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
18320{
18321 RTFLOAT32U r32Src;
18322
18323 r32Src.u = *pu32Src;
18324 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
18325
18326 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18327 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18328 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18329}
18330#endif
18331
18332
18333/**
18334 * CVTSI2SD
18335 */
18336#ifdef IEM_WITHOUT_ASSEMBLY
18337IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
18338{
18339 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18340 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
18341 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
18342}
18343
18344
18345IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
18346{
18347 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18348 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
18349 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
18350}
18351#endif
18352
18353
18354/**
18355 * CVTSI2SS
18356 */
18357#ifdef IEM_WITHOUT_ASSEMBLY
18358IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
18359{
18360 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18361 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
18362 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
18363}
18364
18365
18366IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
18367{
18368 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
18369 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
18370 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
18371}
18372#endif
18373
18374
18375/**
18376 * [V]UCOMISS
18377 */
18378#ifdef IEM_WITHOUT_ASSEMBLY
18379IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18380{
18381 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18382
18383 if (RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]))
18384 {
18385 *pfMxcsr |= X86_MXCSR_IE;
18386 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18387 }
18388 else if (RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
18389 {
18390 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18391 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18392 }
18393 else
18394 {
18395 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18396
18397 RTFLOAT32U r32Src1, r32Src2;
18398 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
18399 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
18400
18401 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18402 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18403 if (f32_eq(f32Src1, f32Src2, &SoftState))
18404 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18405 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18406 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18407 /* else: GREATER_THAN 000 */
18408
18409 *pfMxcsr |= fDe;
18410 }
18411
18412 *pfEFlags = fEFlagsNew;
18413}
18414#endif
18415
18416IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18417{
18418 iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
18419}
18420
18421
18422/**
18423 * [V]UCOMISD
18424 */
18425#ifdef IEM_WITHOUT_ASSEMBLY
18426IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18427{
18428 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18429
18430 if (RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]))
18431 {
18432 *pfMxcsr |= X86_MXCSR_IE;
18433 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18434 }
18435 else if (RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
18436 {
18437 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18438 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18439 }
18440 else
18441 {
18442 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18443
18444 RTFLOAT64U r64Src1, r64Src2;
18445 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0])
18446 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
18447
18448 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18449 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18450 if (f64_eq(f64Src1, f64Src2, &SoftState))
18451 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18452 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18453 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18454 /* else: GREATER_THAN 000 */
18455
18456 *pfMxcsr |= fDe;
18457 }
18458
18459 *pfEFlags = fEFlagsNew;
18460}
18461#endif
18462
18463IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18464{
18465 iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
18466}
18467
18468
18469/**
18470 * [V]COMISS
18471 */
18472#ifdef IEM_WITHOUT_ASSEMBLY
18473IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18474{
18475 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18476
18477 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
18478 || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
18479 {
18480 *pfMxcsr |= X86_MXCSR_IE;
18481 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18482 }
18483 else
18484 {
18485 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18486
18487 RTFLOAT32U r32Src1, r32Src2;
18488 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0])
18489 | iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
18490
18491 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18492 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18493 if (f32_eq(f32Src1, f32Src2, &SoftState))
18494 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18495 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18496 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18497 /* else: GREATER_THAN 000 */
18498
18499 *pfMxcsr |= fDe;
18500 }
18501
18502 *pfEFlags = fEFlagsNew;
18503}
18504#endif
18505
18506
18507IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18508{
18509 iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
18510}
18511
18512
18513/**
18514 * [V]COMISD
18515 */
18516#ifdef IEM_WITHOUT_ASSEMBLY
18517IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18518{
18519 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18520
18521 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
18522 || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
18523 {
18524 *pfMxcsr |= X86_MXCSR_IE;
18525 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18526 }
18527 else
18528 {
18529 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18530
18531 RTFLOAT64U r64Src1, r64Src2;
18532 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
18533 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
18534
18535 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18536 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18537 if (f64_eq(f64Src1, f64Src2, &SoftState))
18538 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18539 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18540 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18541 /* else: GREATER_THAN 000 */
18542
18543 *pfMxcsr |= fDe;
18544 }
18545
18546 *pfEFlags = fEFlagsNew;
18547}
18548#endif
18549
18550IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18551{
18552 iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
18553}
18554
18555
18556/**
18557 * CMPPS / CMPPD / CMPSS / CMPSD
18558 */
18559#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * One row of the compare truth table: the boolean outcome of a compare
 * predicate for each possible relation between the operands A and B.
 */
typedef struct CMPTRUTHTBLENTRY
{
    /** Whether the \#IA is signalled when one of the source operands is a QNaN. */
    bool                fSignalsOnQNan;
    /** The result when the operands are unordered. */
    bool                fUnordered;
    /** The result when A = B. */
    bool                fEqual;
    /** The result when A < B. */
    bool                fLowerThan;
    /** The result when A > B. */
    bool                fGreaterThan;
} CMPTRUTHTBLENTRY;
/** Pointer to a const truth table entry. */
typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
18578
18579
18580/** The compare truth table (indexed by immediate). */
18581static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
18582{
18583 /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
18584 /* 00H (EQ_OQ) */ { false, false, true, false, false },
18585 /* 01H (LT_OS) */ { true, false, false, true, false },
18586 /* 02H (LE_OS) */ { true, false, true, true, false },
18587 /* 03H (UNORD_Q) */ { false, true, false, false, false },
18588 /* 04H (NEQ_UQ) */ { false, true, false, true, true },
18589 /* 05H (NLT_US) */ { true, true, true, false, true },
18590 /* 06H (NLE_US) */ { true, true, false, false, true },
18591 /* 07H (ORQ_Q) */ { false, false, true, true, true },
18592 /** @todo AVX variants. */
18593};
18594
18595
18596static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
18597{
18598 bool fRes;
18599 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18600
18601 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
18602 {
18603 *pfMxcsr |= X86_MXCSR_IE;
18604 fRes = g_aCmpTbl[bEvil].fUnordered;
18605 }
18606 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
18607 {
18608 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18609 *pfMxcsr |= X86_MXCSR_IE;
18610 fRes = g_aCmpTbl[bEvil].fUnordered;
18611 }
18612 else
18613 {
18614 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18615
18616 RTFLOAT32U r32Src1, r32Src2;
18617 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
18618 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
18619
18620 *pfMxcsr |= fDe;
18621 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18622 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18623 if (f32_eq(f32Src1, f32Src2, &SoftState))
18624 fRes = g_aCmpTbl[bEvil].fEqual;
18625 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18626 fRes = g_aCmpTbl[bEvil].fLowerThan;
18627 else
18628 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18629 }
18630
18631 return fRes;
18632}
18633
18634
18635static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
18636{
18637 bool fRes;
18638 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18639
18640 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
18641 {
18642 *pfMxcsr |= X86_MXCSR_IE;
18643 fRes = g_aCmpTbl[bEvil].fUnordered;
18644 }
18645 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
18646 {
18647 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18648 *pfMxcsr |= X86_MXCSR_IE;
18649 fRes = g_aCmpTbl[bEvil].fUnordered;
18650 }
18651 else
18652 {
18653 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18654
18655 RTFLOAT64U r64Src1, r64Src2;
18656 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
18657 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
18658
18659 *pfMxcsr |= fDe;
18660 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18661 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18662 if (f64_eq(f64Src1, f64Src2, &SoftState))
18663 fRes = g_aCmpTbl[bEvil].fEqual;
18664 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18665 fRes = g_aCmpTbl[bEvil].fLowerThan;
18666 else
18667 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18668 }
18669
18670 return fRes;
18671}
18672
18673
18674IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18675{
18676 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18677 {
18678 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
18679 puDst->au32[i] = UINT32_MAX;
18680 else
18681 puDst->au32[i] = 0;
18682 }
18683}
18684
18685
18686IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18687{
18688 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18689 {
18690 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
18691 puDst->au64[i] = UINT64_MAX;
18692 else
18693 puDst->au64[i] = 0;
18694 }
18695}
18696
18697
18698IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18699{
18700 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
18701 puDst->au32[0] = UINT32_MAX;
18702 else
18703 puDst->au32[0] = 0;
18704
18705 puDst->au32[1] = pSrc->uSrc1.au32[1];
18706 puDst->au64[1] = pSrc->uSrc1.au64[1];
18707}
18708
18709
18710IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18711{
18712 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
18713 puDst->au64[0] = UINT64_MAX;
18714 else
18715 puDst->au64[0] = 0;
18716
18717 puDst->au64[1] = pSrc->uSrc1.au64[1];
18718}
18719#endif
18720
18721
18722/**
18723 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
18724 */
18725
18726#define X86_SSE_ROUNDXX_IMM_RC_MASK UINT8_C(0x03)
18727#define X86_SSE_ROUNDXX_IMM_ROUND_SEL UINT8_C(0x04)
18728#define X86_SSE_ROUNDXX_IMM_PRECISION UINT8_C(0x08)
18729
18730#define X86_SSE_ROUNDXX_IMM_MASK UINT8_C(0x0F)
18731
18732DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
18733{
18734 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
18735 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18736
18737 fMxcsr &= ~X86_MXCSR_RC_MASK;
18738 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
18739 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18740}
18741
18742static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
18743{
18744 RTFLOAT32U r32Src, r32Dst;
18745 float32_t f32Src;
18746 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18747 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18748
18749 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
18750 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
18751
18752 iemFpSoftF32ToIprt(&r32Dst, f32Src);
18753 return r32Dst;
18754}
18755
18756static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
18757{
18758 RTFLOAT64U r64Src, r64Dst;
18759 float64_t f64Src;
18760 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18761 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18762
18763 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
18764 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
18765
18766 iemFpSoftF64ToIprt(&r64Dst, f64Src);
18767 return r64Dst;
18768}
18769
18770#ifdef IEM_WITHOUT_ASSEMBLY
18771IEM_DECL_IMPL_DEF(void, iemAImpl_roundss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18772{
18773 puDst->ar32[0] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18774 puDst->au32[1] = pSrc->uSrc1.au32[1];
18775 puDst->au64[1] = pSrc->uSrc1.au64[1];
18776}
18777
18778
18779IEM_DECL_IMPL_DEF(void, iemAImpl_roundsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18780{
18781 puDst->ar64[0] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18782 puDst->au64[1] = pSrc->uSrc1.au64[1];
18783}
18784#endif
18785
18786IEM_DECL_IMPL_DEF(void, iemAImpl_roundps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18787{
18788 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18789 {
18790 puDst->ar32[i] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18791 }
18792}
18793
18794
18795IEM_DECL_IMPL_DEF(void, iemAImpl_roundpd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18796{
18797 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18798 {
18799 puDst->ar64[i] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18800 }
18801}
18802
18803/**
18804 * CVTPD2PI
18805 */
18806#ifdef IEM_WITHOUT_ASSEMBLY
18807static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18808{
18809 RTFLOAT64U r64Src;
18810 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18811
18812 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18813 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18814 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18815}
18816
18817
18818IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18819{
18820 RTUINT64U u64Res;
18821 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
18822 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
18823
18824 *pu64Dst = u64Res.u;
18825 *pfMxcsr = fMxcsrOut;
18826}
18827#endif
18828
18829
18830/**
18831 * CVTTPD2PI
18832 */
18833#ifdef IEM_WITHOUT_ASSEMBLY
18834static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18835{
18836 RTFLOAT64U r64Src;
18837 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18838
18839 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18840 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18841 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18842}
18843
18844
18845IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18846{
18847 RTUINT64U u64Res;
18848 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
18849 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
18850
18851 *pu64Dst = u64Res.u;
18852 *pfMxcsr = fMxcsrOut;
18853}
18854#endif
18855
18856
18857/**
18858 * CVTPI2PS
18859 */
18860#ifdef IEM_WITHOUT_ASSEMBLY
18861static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
18862{
18863 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18864 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
18865 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
18866}
18867
18868
18869IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
18870{
18871 RTUINT64U uSrc = { u64Src };
18872 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
18873 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
18874 *pfMxcsr = fMxcsrOut;
18875}
18876#endif
18877
18878
18879/**
18880 * CVTPI2PD
18881 */
18882#ifdef IEM_WITHOUT_ASSEMBLY
18883static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
18884{
18885 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18886 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
18887 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
18888}
18889
18890
18891IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
18892{
18893 RTUINT64U uSrc = { u64Src };
18894 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
18895 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
18896 *pfMxcsr = fMxcsrOut;
18897}
18898#endif
18899
18900
18901/**
18902 * CVTPS2PI
18903 */
18904#ifdef IEM_WITHOUT_ASSEMBLY
18905static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
18906{
18907 RTFLOAT32U r32Src;
18908 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
18909
18910 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18911 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18912 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18913}
18914
18915
18916IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
18917{
18918 RTUINT64U uDst;
18919 RTUINT64U uSrc = { u64Src };
18920 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
18921 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
18922 *pu64Dst = uDst.u;
18923 *pfMxcsr = fMxcsrOut;
18924}
18925#endif
18926
18927
18928/**
18929 * CVTTPS2PI
18930 */
18931#ifdef IEM_WITHOUT_ASSEMBLY
18932static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
18933{
18934 RTFLOAT32U r32Src;
18935 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
18936
18937 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18938 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18939 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18940}
18941
18942
18943IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
18944{
18945 RTUINT64U uDst;
18946 RTUINT64U uSrc = { u64Src };
18947 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
18948 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
18949 *pu64Dst = uDst.u;
18950 *pfMxcsr = fMxcsrOut;
18951}
18952#endif
18953
18954/**
18955 * RDRAND
18956 */
18957IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18958{
18959 *puDst = 0;
18960 *pEFlags &= ~X86_EFL_STATUS_BITS;
18961 *pEFlags |= X86_EFL_CF;
18962}
18963
18964IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18965{
18966 *puDst = 0;
18967 *pEFlags &= ~X86_EFL_STATUS_BITS;
18968 *pEFlags |= X86_EFL_CF;
18969}
18970
18971IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18972{
18973 *puDst = 0;
18974 *pEFlags &= ~X86_EFL_STATUS_BITS;
18975 *pEFlags |= X86_EFL_CF;
18976}
18977
18978/**
18979 * RDSEED
18980 */
18981IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18982{
18983 *puDst = 0;
18984 *pEFlags &= ~X86_EFL_STATUS_BITS;
18985 *pEFlags |= X86_EFL_CF;
18986}
18987
18988IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18989{
18990 *puDst = 0;
18991 *pEFlags &= ~X86_EFL_STATUS_BITS;
18992 *pEFlags |= X86_EFL_CF;
18993}
18994
18995IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18996{
18997 *puDst = 0;
18998 *pEFlags &= ~X86_EFL_STATUS_BITS;
18999 *pEFlags |= X86_EFL_CF;
19000}
19001
19002
19003/**
19004 * SHA1NEXTE
19005 */
19006IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19007{
19008 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
19009
19010 puDst->au32[0] = puSrc->au32[0];
19011 puDst->au32[1] = puSrc->au32[1];
19012 puDst->au32[2] = puSrc->au32[2];
19013 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
19014}
19015
19016/**
19017 * SHA1MSG1
19018 */
19019IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19020{
19021 uint32_t u32W0 = puDst->au32[3];
19022 uint32_t u32W1 = puDst->au32[2];
19023 uint32_t u32W2 = puDst->au32[1];
19024 uint32_t u32W3 = puDst->au32[0];
19025 uint32_t u32W4 = puSrc->au32[3];
19026 uint32_t u32W5 = puSrc->au32[2];
19027
19028 puDst->au32[3] = u32W2 ^ u32W0;
19029 puDst->au32[2] = u32W3 ^ u32W1;
19030 puDst->au32[1] = u32W4 ^ u32W2;
19031 puDst->au32[0] = u32W5 ^ u32W3;
19032}
19033
19034/**
19035 * SHA1MSG2
19036 */
19037IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19038{
19039 uint32_t u32W13 = puSrc->au32[2];
19040 uint32_t u32W14 = puSrc->au32[1];
19041 uint32_t u32W15 = puSrc->au32[0];
19042 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
19043 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
19044 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
19045 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
19046
19047 puDst->au32[3] = u32W16;
19048 puDst->au32[2] = u32W17;
19049 puDst->au32[1] = u32W18;
19050 puDst->au32[0] = u32W19;
19051}
19052
19053/**
19054 * SHA1RNDS4
19055 */
19056typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
19057typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;
19058
19059static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19060{
19061 return (u32B & u32C) ^ (~u32B & u32D);
19062}
19063
19064static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19065{
19066 return u32B ^ u32C ^ u32D;
19067}
19068
19069static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19070{
19071 return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
19072}
19073
19074static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19075{
19076 return u32B ^ u32C ^ u32D;
19077}
19078
19079IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19080{
19081 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
19082 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
19083
19084 uint32_t au32A[5];
19085 uint32_t au32B[5];
19086 uint32_t au32C[5];
19087 uint32_t au32D[5];
19088 uint32_t au32E[5];
19089 uint32_t au32W[4];
19090 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
19091 uint32_t u32K = s_au32K[bEvil & 0x3];
19092
19093 au32A[0] = puDst->au32[3];
19094 au32B[0] = puDst->au32[2];
19095 au32C[0] = puDst->au32[1];
19096 au32D[0] = puDst->au32[0];
19097 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
19098 au32W[i] = puSrc->au32[3 - i];
19099
19100 /* Round 0 is a bit different than the other rounds. */
19101 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
19102 au32B[1] = au32A[0];
19103 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
19104 au32D[1] = au32C[0];
19105 au32E[1] = au32D[0];
19106
19107 for (uint32_t i = 1; i <= 3; i++)
19108 {
19109 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
19110 au32B[i + 1] = au32A[i];
19111 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
19112 au32D[i + 1] = au32C[i];
19113 au32E[i + 1] = au32D[i];
19114 }
19115
19116 puDst->au32[3] = au32A[4];
19117 puDst->au32[2] = au32B[4];
19118 puDst->au32[1] = au32C[4];
19119 puDst->au32[0] = au32D[4];
19120}
19121
19122
19123/**
19124 * SHA256MSG1
19125 */
19126DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
19127{
19128 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
19129}
19130
19131IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19132{
19133 uint32_t u32W4 = puSrc->au32[0];
19134 uint32_t u32W3 = puDst->au32[3];
19135 uint32_t u32W2 = puDst->au32[2];
19136 uint32_t u32W1 = puDst->au32[1];
19137 uint32_t u32W0 = puDst->au32[0];
19138
19139 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
19140 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
19141 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
19142 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
19143}
19144
19145/**
19146 * SHA256MSG2
19147 */
19148DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
19149{
19150 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
19151}
19152
19153IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19154{
19155 uint32_t u32W14 = puSrc->au32[2];
19156 uint32_t u32W15 = puSrc->au32[3];
19157 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
19158 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
19159 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
19160 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
19161
19162 puDst->au32[3] = u32W19;
19163 puDst->au32[2] = u32W18;
19164 puDst->au32[1] = u32W17;
19165 puDst->au32[0] = u32W16;
19166}
19167
19168/**
19169 * SHA256RNDS2
19170 */
19171DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19172{
19173 return (u32X & u32Y) ^ (~u32X & u32Z);
19174}
19175
19176DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19177{
19178 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
19179}
19180
19181DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
19182{
19183 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
19184}
19185
19186DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
19187{
19188 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
19189}
19190
19191IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
19192{
19193 uint32_t au32A[3];
19194 uint32_t au32B[3];
19195 uint32_t au32C[3];
19196 uint32_t au32D[3];
19197 uint32_t au32E[3];
19198 uint32_t au32F[3];
19199 uint32_t au32G[3];
19200 uint32_t au32H[3];
19201 uint32_t au32WK[2];
19202
19203 au32A[0] = puSrc->au32[3];
19204 au32B[0] = puSrc->au32[2];
19205 au32C[0] = puDst->au32[3];
19206 au32D[0] = puDst->au32[2];
19207 au32E[0] = puSrc->au32[1];
19208 au32F[0] = puSrc->au32[0];
19209 au32G[0] = puDst->au32[1];
19210 au32H[0] = puDst->au32[0];
19211
19212 au32WK[0] = puXmm0Constants->au32[0];
19213 au32WK[1] = puXmm0Constants->au32[1];
19214
19215 for (uint32_t i = 0; i < 2; i++)
19216 {
19217 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19218 + iemAImpl_sha256_upper_sigma1(au32E[i])
19219 + au32WK[i]
19220 + au32H[i]
19221 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
19222 + iemAImpl_sha256_upper_sigma0(au32A[i]);
19223 au32B[i + 1] = au32A[i];
19224 au32C[i + 1] = au32B[i];
19225 au32D[i + 1] = au32C[i];
19226 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19227 + iemAImpl_sha256_upper_sigma1(au32E[i])
19228 + au32WK[i]
19229 + au32H[i]
19230 + au32D[i];
19231 au32F[i + 1] = au32E[i];
19232 au32G[i + 1] = au32F[i];
19233 au32H[i + 1] = au32G[i];
19234 }
19235
19236 puDst->au32[3] = au32A[2];
19237 puDst->au32[2] = au32B[2];
19238 puDst->au32[1] = au32E[2];
19239 puDst->au32[0] = au32F[2];
19240}
19241
19242
19243/**
19244 * ADCX
19245 */
/**
 * Body shared by the ADCX/ADOX implementations: *puDst += uSrc + flag, where
 * only the given EFLAGS bit is read and updated.
 *
 * Expects puDst, uSrc and pfEFlags to be in scope at the point of expansion.
 *
 * @param   a_Flag  The EFLAGS bit acting as carry in and carry out.
 * @param   a_Type  The operand type.
 * @param   a_Max   The maximum value of a_Type.
 */
#define ADX_EMIT(a_Flag, a_Type, a_Max) \
    do \
    { \
        bool const   fCarryIn  = RT_BOOL(*pfEFlags & (a_Flag)); \
        a_Type const uSum      = *puDst + uSrc; \
        /* Carry out if the plain sum wrapped, or if adding the carry-in will wrap it. */ \
        bool const   fCarryOut = uSum < uSrc || (fCarryIn && uSum == a_Max); \
        if (fCarryOut) \
            *pfEFlags |= (a_Flag); \
        else \
            *pfEFlags &= ~(a_Flag); \
        *puDst = fCarryIn ? uSum + 1 : uSum; \
    } \
    while (0)
19263
19264IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19265{
19266 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
19267}
19268
19269IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19270{
19271 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
19272}
19273
19274# if defined(IEM_WITHOUT_ASSEMBLY)
19275
19276IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19277{
19278 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
19279}
19280
19281IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19282{
19283 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
19284}
19285
19286#endif
19287
19288
19289/**
19290 * ADOX
19291 */
19292IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19293{
19294 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
19295}
19296
19297IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19298{
19299 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
19300}
19301
19302# if defined(IEM_WITHOUT_ASSEMBLY)
19303
19304IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19305{
19306 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
19307}
19308
19309IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19310{
19311 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
19312}
19313
19314# endif
19315
19316
19317/**
19318 * MPSADBW
19319 */
19320IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19321{
19322 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19323 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19324 int16_t ai16Src1[11];
19325 int16_t ai16Src2[4];
19326
19327 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19328 ai16Src1[i] = puDst->au8[idxSrc1 + i];
19329
19330 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19331 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
19332
19333 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19334 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19335 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19336 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19337 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19338}
19339
19340
19341IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
19342{
19343 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19344 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19345 int16_t ai16Src1[11];
19346 int16_t ai16Src2[4];
19347
19348 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19349 ai16Src1[i] = puSrc1->au8[idxSrc1 + i];
19350
19351 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19352 ai16Src2[i] = puSrc2->au8[idxSrc2 + i];
19353
19354 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19355 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19356 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19357 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19358 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19359}
19360
19361
19362IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
19363{
19364 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
19365 RTUINT256U const uSrc2 = *puSrc2;
19366 ASMCompilerBarrier();
19367 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
19368 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil >> 3);
19369}
19370
19371
19372/**
19373 * VPERM2I128
19374 */
19375IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2i128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
19376{
19377 if (bImm & RT_BIT(3))
19378 {
19379 puDst->au64[0] = 0;
19380 puDst->au64[1] = 0;
19381 }
19382 else
19383 {
19384 switch (bImm & 0x3)
19385 {
19386 case 0:
19387 puDst->au64[0] = puSrc1->au64[0];
19388 puDst->au64[1] = puSrc1->au64[1];
19389 break;
19390 case 1:
19391 puDst->au64[0] = puSrc1->au64[2];
19392 puDst->au64[1] = puSrc1->au64[3];
19393 break;
19394 case 2:
19395 puDst->au64[0] = puSrc2->au64[0];
19396 puDst->au64[1] = puSrc2->au64[1];
19397 break;
19398 case 3:
19399 puDst->au64[0] = puSrc2->au64[2];
19400 puDst->au64[1] = puSrc2->au64[3];
19401 break;
19402 }
19403 }
19404
19405 if (bImm & RT_BIT(7))
19406 {
19407 puDst->au64[2] = 0;
19408 puDst->au64[3] = 0;
19409 }
19410 else
19411 {
19412 switch ((bImm >> 4) & 0x3)
19413 {
19414 case 0:
19415 puDst->au64[2] = puSrc1->au64[0];
19416 puDst->au64[3] = puSrc1->au64[1];
19417 break;
19418 case 1:
19419 puDst->au64[2] = puSrc1->au64[2];
19420 puDst->au64[3] = puSrc1->au64[3];
19421 break;
19422 case 2:
19423 puDst->au64[2] = puSrc2->au64[0];
19424 puDst->au64[3] = puSrc2->au64[1];
19425 break;
19426 case 3:
19427 puDst->au64[2] = puSrc2->au64[2];
19428 puDst->au64[3] = puSrc2->au64[3];
19429 break;
19430 }
19431 }
19432}
19433
19434
19435/**
19436 * VPERM2F128
19437 */
19438IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2f128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
19439{
19440 iemAImpl_vperm2i128_u256_fallback(puDst, puSrc1, puSrc2, bImm);
19441}
19442
19443
19444/**
19445 * DPPS
19446 */
19447IEM_DECL_IMPL_DEF(void, iemAImpl_dpps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
19448{
19449 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
19450 AssertReleaseFailed();
19451}
19452
19453
19454/**
19455 * DPPD
19456 */
19457IEM_DECL_IMPL_DEF(void, iemAImpl_dppd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
19458{
19459 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
19460 AssertReleaseFailed();
19461}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette