VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 103205

Last change on this file since 103205 was 103186, checked in by vboxsync, 12 months ago

VMM/IEM: Some IEMAllAImplC.cpp adjustments. bugref:10372

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 718.3 KB
Line 
1/* $Id: IEMAllAImplC.cpp 103186 2024-02-04 15:43:38Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Defined automatically for ARM32/ARM64 hosts and when doxygen is running,
 * unless the build already defined it.
 */
#if !defined(IEM_WITHOUT_ASSEMBLY)
# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
#  define IEM_WITHOUT_ASSEMBLY
# endif
#endif
/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
#ifdef IEM_WITH_ASSEMBLY
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Calculates the sign flag value given a result and its bit width.
 *
 * The sign flag (SF) is a duplication of the most significant bit in the
 * result.  The result is shifted right so that its most significant bit lands
 * on the SF bit position and everything else is masked off.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) & X86_EFL_SF )
73
/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) indicates whether the result is zero or not.  The
 * comparison outcome (0 or 1) is shifted up to the ZF bit position.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)((a_uResult) == 0) << X86_EFL_ZF_BIT )
84
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * Each variant moves the most significant bit of a value of the given width to
 * the OF bit position and masks off everything else.
 *
 * These are typically used by concatenating the name with a bit count.  The
 * problem is that 8-bit values need shifting in the other direction (left)
 * than the others (right), hence one macro per width.
 *
 * @returns X86_EFL_OF or 0.
 * @param   a_uValue        The value whose top bit (for the given width)
 *                          holds the overflow indicator.
 */
#define X86_EFL_GET_OF_8(a_uValue)  (((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_16(a_uValue) ((uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_32(a_uValue) ((uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
#define X86_EFL_GET_OF_64(a_uValue) ((uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
95
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after arithmetic op.
 *
 * This is a statement macro and does not yield a value: the bits covered by
 * X86_EFL_STATUS_BITS are cleared in the value @a a_pfEFlags points to and
 * then recalculated, all other bits are left as they were.
 *
 * Overflow during ADDition happens when both inputs have the same sign bit
 * value and the result has a different sign bit value.
 *
 * Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it follows
 * that for SUBtraction the sign bit value must differ between the two inputs
 * and the result's sign bit must differ from the first input.  Callers doing
 * subtraction therefore pass the source with its sign bit inverted as
 * @a a_uSrcOf.
 *
 * @note    Must xor with the sign bit to convert, not do (0 - a_uSrc).
 * @note    Arguments are expanded more than once and must be free of side
 *          effects.
 * @see     http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be
 *                          a plain literal since it is token pasted.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* OF: inputs agree on the sign bit, result disagrees with a_uDst. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                      & RT_BIT_64(a_cBitsWidth - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
132
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand is
 * undefined.  We clear AF, as that seems to make the most sense and also seems
 * to be the correct behavior on current CPUs.
 *
 * This is a statement macro and does not yield a value: the bits covered by
 * X86_EFL_STATUS_BITS are cleared in the value @a a_pfEFlags points to, then
 * PF, ZF and SF are derived from the result and @a a_fExtra is OR'ed in.
 *
 * @note    Arguments are expanded more than once and must be free of side
 *          effects.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
161/**
162 * Parity calculation table.
163 *
164 * This is also used by iemAllAImpl.asm.
165 *
166 * The generator code:
167 * @code
168 * #include <stdio.h>
169 *
170 * int main()
171 * {
172 * unsigned b;
173 * for (b = 0; b < 256; b++)
174 * {
175 * int cOnes = ( b & 1)
176 * + ((b >> 1) & 1)
177 * + ((b >> 2) & 1)
178 * + ((b >> 3) & 1)
179 * + ((b >> 4) & 1)
180 * + ((b >> 5) & 1)
181 * + ((b >> 6) & 1)
182 * + ((b >> 7) & 1);
183 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
184 * b,
185 * (b >> 7) & 1,
186 * (b >> 6) & 1,
187 * (b >> 5) & 1,
188 * (b >> 4) & 1,
189 * (b >> 3) & 1,
190 * (b >> 2) & 1,
191 * (b >> 1) & 1,
192 * b & 1,
193 * cOnes & 1 ? "0" : "X86_EFL_PF");
194 * }
195 * return 0;
196 * }
197 * @endcode
198 */
199uint8_t const g_afParity[256] =
200{
201 /* 0000 = 00000000b */ X86_EFL_PF,
202 /* 0x01 = 00000001b */ 0,
203 /* 0x02 = 00000010b */ 0,
204 /* 0x03 = 00000011b */ X86_EFL_PF,
205 /* 0x04 = 00000100b */ 0,
206 /* 0x05 = 00000101b */ X86_EFL_PF,
207 /* 0x06 = 00000110b */ X86_EFL_PF,
208 /* 0x07 = 00000111b */ 0,
209 /* 0x08 = 00001000b */ 0,
210 /* 0x09 = 00001001b */ X86_EFL_PF,
211 /* 0x0a = 00001010b */ X86_EFL_PF,
212 /* 0x0b = 00001011b */ 0,
213 /* 0x0c = 00001100b */ X86_EFL_PF,
214 /* 0x0d = 00001101b */ 0,
215 /* 0x0e = 00001110b */ 0,
216 /* 0x0f = 00001111b */ X86_EFL_PF,
217 /* 0x10 = 00010000b */ 0,
218 /* 0x11 = 00010001b */ X86_EFL_PF,
219 /* 0x12 = 00010010b */ X86_EFL_PF,
220 /* 0x13 = 00010011b */ 0,
221 /* 0x14 = 00010100b */ X86_EFL_PF,
222 /* 0x15 = 00010101b */ 0,
223 /* 0x16 = 00010110b */ 0,
224 /* 0x17 = 00010111b */ X86_EFL_PF,
225 /* 0x18 = 00011000b */ X86_EFL_PF,
226 /* 0x19 = 00011001b */ 0,
227 /* 0x1a = 00011010b */ 0,
228 /* 0x1b = 00011011b */ X86_EFL_PF,
229 /* 0x1c = 00011100b */ 0,
230 /* 0x1d = 00011101b */ X86_EFL_PF,
231 /* 0x1e = 00011110b */ X86_EFL_PF,
232 /* 0x1f = 00011111b */ 0,
233 /* 0x20 = 00100000b */ 0,
234 /* 0x21 = 00100001b */ X86_EFL_PF,
235 /* 0x22 = 00100010b */ X86_EFL_PF,
236 /* 0x23 = 00100011b */ 0,
237 /* 0x24 = 00100100b */ X86_EFL_PF,
238 /* 0x25 = 00100101b */ 0,
239 /* 0x26 = 00100110b */ 0,
240 /* 0x27 = 00100111b */ X86_EFL_PF,
241 /* 0x28 = 00101000b */ X86_EFL_PF,
242 /* 0x29 = 00101001b */ 0,
243 /* 0x2a = 00101010b */ 0,
244 /* 0x2b = 00101011b */ X86_EFL_PF,
245 /* 0x2c = 00101100b */ 0,
246 /* 0x2d = 00101101b */ X86_EFL_PF,
247 /* 0x2e = 00101110b */ X86_EFL_PF,
248 /* 0x2f = 00101111b */ 0,
249 /* 0x30 = 00110000b */ X86_EFL_PF,
250 /* 0x31 = 00110001b */ 0,
251 /* 0x32 = 00110010b */ 0,
252 /* 0x33 = 00110011b */ X86_EFL_PF,
253 /* 0x34 = 00110100b */ 0,
254 /* 0x35 = 00110101b */ X86_EFL_PF,
255 /* 0x36 = 00110110b */ X86_EFL_PF,
256 /* 0x37 = 00110111b */ 0,
257 /* 0x38 = 00111000b */ 0,
258 /* 0x39 = 00111001b */ X86_EFL_PF,
259 /* 0x3a = 00111010b */ X86_EFL_PF,
260 /* 0x3b = 00111011b */ 0,
261 /* 0x3c = 00111100b */ X86_EFL_PF,
262 /* 0x3d = 00111101b */ 0,
263 /* 0x3e = 00111110b */ 0,
264 /* 0x3f = 00111111b */ X86_EFL_PF,
265 /* 0x40 = 01000000b */ 0,
266 /* 0x41 = 01000001b */ X86_EFL_PF,
267 /* 0x42 = 01000010b */ X86_EFL_PF,
268 /* 0x43 = 01000011b */ 0,
269 /* 0x44 = 01000100b */ X86_EFL_PF,
270 /* 0x45 = 01000101b */ 0,
271 /* 0x46 = 01000110b */ 0,
272 /* 0x47 = 01000111b */ X86_EFL_PF,
273 /* 0x48 = 01001000b */ X86_EFL_PF,
274 /* 0x49 = 01001001b */ 0,
275 /* 0x4a = 01001010b */ 0,
276 /* 0x4b = 01001011b */ X86_EFL_PF,
277 /* 0x4c = 01001100b */ 0,
278 /* 0x4d = 01001101b */ X86_EFL_PF,
279 /* 0x4e = 01001110b */ X86_EFL_PF,
280 /* 0x4f = 01001111b */ 0,
281 /* 0x50 = 01010000b */ X86_EFL_PF,
282 /* 0x51 = 01010001b */ 0,
283 /* 0x52 = 01010010b */ 0,
284 /* 0x53 = 01010011b */ X86_EFL_PF,
285 /* 0x54 = 01010100b */ 0,
286 /* 0x55 = 01010101b */ X86_EFL_PF,
287 /* 0x56 = 01010110b */ X86_EFL_PF,
288 /* 0x57 = 01010111b */ 0,
289 /* 0x58 = 01011000b */ 0,
290 /* 0x59 = 01011001b */ X86_EFL_PF,
291 /* 0x5a = 01011010b */ X86_EFL_PF,
292 /* 0x5b = 01011011b */ 0,
293 /* 0x5c = 01011100b */ X86_EFL_PF,
294 /* 0x5d = 01011101b */ 0,
295 /* 0x5e = 01011110b */ 0,
296 /* 0x5f = 01011111b */ X86_EFL_PF,
297 /* 0x60 = 01100000b */ X86_EFL_PF,
298 /* 0x61 = 01100001b */ 0,
299 /* 0x62 = 01100010b */ 0,
300 /* 0x63 = 01100011b */ X86_EFL_PF,
301 /* 0x64 = 01100100b */ 0,
302 /* 0x65 = 01100101b */ X86_EFL_PF,
303 /* 0x66 = 01100110b */ X86_EFL_PF,
304 /* 0x67 = 01100111b */ 0,
305 /* 0x68 = 01101000b */ 0,
306 /* 0x69 = 01101001b */ X86_EFL_PF,
307 /* 0x6a = 01101010b */ X86_EFL_PF,
308 /* 0x6b = 01101011b */ 0,
309 /* 0x6c = 01101100b */ X86_EFL_PF,
310 /* 0x6d = 01101101b */ 0,
311 /* 0x6e = 01101110b */ 0,
312 /* 0x6f = 01101111b */ X86_EFL_PF,
313 /* 0x70 = 01110000b */ 0,
314 /* 0x71 = 01110001b */ X86_EFL_PF,
315 /* 0x72 = 01110010b */ X86_EFL_PF,
316 /* 0x73 = 01110011b */ 0,
317 /* 0x74 = 01110100b */ X86_EFL_PF,
318 /* 0x75 = 01110101b */ 0,
319 /* 0x76 = 01110110b */ 0,
320 /* 0x77 = 01110111b */ X86_EFL_PF,
321 /* 0x78 = 01111000b */ X86_EFL_PF,
322 /* 0x79 = 01111001b */ 0,
323 /* 0x7a = 01111010b */ 0,
324 /* 0x7b = 01111011b */ X86_EFL_PF,
325 /* 0x7c = 01111100b */ 0,
326 /* 0x7d = 01111101b */ X86_EFL_PF,
327 /* 0x7e = 01111110b */ X86_EFL_PF,
328 /* 0x7f = 01111111b */ 0,
329 /* 0x80 = 10000000b */ 0,
330 /* 0x81 = 10000001b */ X86_EFL_PF,
331 /* 0x82 = 10000010b */ X86_EFL_PF,
332 /* 0x83 = 10000011b */ 0,
333 /* 0x84 = 10000100b */ X86_EFL_PF,
334 /* 0x85 = 10000101b */ 0,
335 /* 0x86 = 10000110b */ 0,
336 /* 0x87 = 10000111b */ X86_EFL_PF,
337 /* 0x88 = 10001000b */ X86_EFL_PF,
338 /* 0x89 = 10001001b */ 0,
339 /* 0x8a = 10001010b */ 0,
340 /* 0x8b = 10001011b */ X86_EFL_PF,
341 /* 0x8c = 10001100b */ 0,
342 /* 0x8d = 10001101b */ X86_EFL_PF,
343 /* 0x8e = 10001110b */ X86_EFL_PF,
344 /* 0x8f = 10001111b */ 0,
345 /* 0x90 = 10010000b */ X86_EFL_PF,
346 /* 0x91 = 10010001b */ 0,
347 /* 0x92 = 10010010b */ 0,
348 /* 0x93 = 10010011b */ X86_EFL_PF,
349 /* 0x94 = 10010100b */ 0,
350 /* 0x95 = 10010101b */ X86_EFL_PF,
351 /* 0x96 = 10010110b */ X86_EFL_PF,
352 /* 0x97 = 10010111b */ 0,
353 /* 0x98 = 10011000b */ 0,
354 /* 0x99 = 10011001b */ X86_EFL_PF,
355 /* 0x9a = 10011010b */ X86_EFL_PF,
356 /* 0x9b = 10011011b */ 0,
357 /* 0x9c = 10011100b */ X86_EFL_PF,
358 /* 0x9d = 10011101b */ 0,
359 /* 0x9e = 10011110b */ 0,
360 /* 0x9f = 10011111b */ X86_EFL_PF,
361 /* 0xa0 = 10100000b */ X86_EFL_PF,
362 /* 0xa1 = 10100001b */ 0,
363 /* 0xa2 = 10100010b */ 0,
364 /* 0xa3 = 10100011b */ X86_EFL_PF,
365 /* 0xa4 = 10100100b */ 0,
366 /* 0xa5 = 10100101b */ X86_EFL_PF,
367 /* 0xa6 = 10100110b */ X86_EFL_PF,
368 /* 0xa7 = 10100111b */ 0,
369 /* 0xa8 = 10101000b */ 0,
370 /* 0xa9 = 10101001b */ X86_EFL_PF,
371 /* 0xaa = 10101010b */ X86_EFL_PF,
372 /* 0xab = 10101011b */ 0,
373 /* 0xac = 10101100b */ X86_EFL_PF,
374 /* 0xad = 10101101b */ 0,
375 /* 0xae = 10101110b */ 0,
376 /* 0xaf = 10101111b */ X86_EFL_PF,
377 /* 0xb0 = 10110000b */ 0,
378 /* 0xb1 = 10110001b */ X86_EFL_PF,
379 /* 0xb2 = 10110010b */ X86_EFL_PF,
380 /* 0xb3 = 10110011b */ 0,
381 /* 0xb4 = 10110100b */ X86_EFL_PF,
382 /* 0xb5 = 10110101b */ 0,
383 /* 0xb6 = 10110110b */ 0,
384 /* 0xb7 = 10110111b */ X86_EFL_PF,
385 /* 0xb8 = 10111000b */ X86_EFL_PF,
386 /* 0xb9 = 10111001b */ 0,
387 /* 0xba = 10111010b */ 0,
388 /* 0xbb = 10111011b */ X86_EFL_PF,
389 /* 0xbc = 10111100b */ 0,
390 /* 0xbd = 10111101b */ X86_EFL_PF,
391 /* 0xbe = 10111110b */ X86_EFL_PF,
392 /* 0xbf = 10111111b */ 0,
393 /* 0xc0 = 11000000b */ X86_EFL_PF,
394 /* 0xc1 = 11000001b */ 0,
395 /* 0xc2 = 11000010b */ 0,
396 /* 0xc3 = 11000011b */ X86_EFL_PF,
397 /* 0xc4 = 11000100b */ 0,
398 /* 0xc5 = 11000101b */ X86_EFL_PF,
399 /* 0xc6 = 11000110b */ X86_EFL_PF,
400 /* 0xc7 = 11000111b */ 0,
401 /* 0xc8 = 11001000b */ 0,
402 /* 0xc9 = 11001001b */ X86_EFL_PF,
403 /* 0xca = 11001010b */ X86_EFL_PF,
404 /* 0xcb = 11001011b */ 0,
405 /* 0xcc = 11001100b */ X86_EFL_PF,
406 /* 0xcd = 11001101b */ 0,
407 /* 0xce = 11001110b */ 0,
408 /* 0xcf = 11001111b */ X86_EFL_PF,
409 /* 0xd0 = 11010000b */ 0,
410 /* 0xd1 = 11010001b */ X86_EFL_PF,
411 /* 0xd2 = 11010010b */ X86_EFL_PF,
412 /* 0xd3 = 11010011b */ 0,
413 /* 0xd4 = 11010100b */ X86_EFL_PF,
414 /* 0xd5 = 11010101b */ 0,
415 /* 0xd6 = 11010110b */ 0,
416 /* 0xd7 = 11010111b */ X86_EFL_PF,
417 /* 0xd8 = 11011000b */ X86_EFL_PF,
418 /* 0xd9 = 11011001b */ 0,
419 /* 0xda = 11011010b */ 0,
420 /* 0xdb = 11011011b */ X86_EFL_PF,
421 /* 0xdc = 11011100b */ 0,
422 /* 0xdd = 11011101b */ X86_EFL_PF,
423 /* 0xde = 11011110b */ X86_EFL_PF,
424 /* 0xdf = 11011111b */ 0,
425 /* 0xe0 = 11100000b */ 0,
426 /* 0xe1 = 11100001b */ X86_EFL_PF,
427 /* 0xe2 = 11100010b */ X86_EFL_PF,
428 /* 0xe3 = 11100011b */ 0,
429 /* 0xe4 = 11100100b */ X86_EFL_PF,
430 /* 0xe5 = 11100101b */ 0,
431 /* 0xe6 = 11100110b */ 0,
432 /* 0xe7 = 11100111b */ X86_EFL_PF,
433 /* 0xe8 = 11101000b */ X86_EFL_PF,
434 /* 0xe9 = 11101001b */ 0,
435 /* 0xea = 11101010b */ 0,
436 /* 0xeb = 11101011b */ X86_EFL_PF,
437 /* 0xec = 11101100b */ 0,
438 /* 0xed = 11101101b */ X86_EFL_PF,
439 /* 0xee = 11101110b */ X86_EFL_PF,
440 /* 0xef = 11101111b */ 0,
441 /* 0xf0 = 11110000b */ X86_EFL_PF,
442 /* 0xf1 = 11110001b */ 0,
443 /* 0xf2 = 11110010b */ 0,
444 /* 0xf3 = 11110011b */ X86_EFL_PF,
445 /* 0xf4 = 11110100b */ 0,
446 /* 0xf5 = 11110101b */ X86_EFL_PF,
447 /* 0xf6 = 11110110b */ X86_EFL_PF,
448 /* 0xf7 = 11110111b */ 0,
449 /* 0xf8 = 11111000b */ 0,
450 /* 0xf9 = 11111001b */ X86_EFL_PF,
451 /* 0xfa = 11111010b */ X86_EFL_PF,
452 /* 0xfb = 11111011b */ 0,
453 /* 0xfc = 11111100b */ X86_EFL_PF,
454 /* 0xfd = 11111101b */ 0,
455 /* 0xfe = 11111110b */ 0,
456 /* 0xff = 11111111b */ X86_EFL_PF,
457};
458
459/* for clang: */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT32U g_ar32One[];
464extern const RTFLOAT80U g_ar80One[];
465extern const RTFLOAT80U g_r80Indefinite;
466extern const RTFLOAT32U g_ar32Infinity[];
467extern const RTFLOAT64U g_ar64Infinity[];
468extern const RTFLOAT80U g_ar80Infinity[];
469extern const RTFLOAT128U g_r128Ln2;
470extern const RTUINT128U g_u128Ln2Mantissa;
471extern const RTUINT128U g_u128Ln2MantissaIntel;
472extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
473extern const RTFLOAT32U g_ar32QNaN[];
474extern const RTFLOAT64U g_ar64QNaN[];
475
476/** Zero values (indexed by fSign). */
477RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
478RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
479RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
480
481/** One values (indexed by fSign). */
482RTFLOAT32U const g_ar32One[] =
483{ RTFLOAT32U_INIT(0, 0, RTFLOAT32U_EXP_BIAS), RTFLOAT32U_INIT(1, 0, RTFLOAT32U_EXP_BIAS) };
484RTFLOAT80U const g_ar80One[] =
485{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
486
487/** Indefinite (negative). */
488RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
489
490/** Infinities (indexed by fSign). */
491RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
492RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
493RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
494
495/** Default QNaNs (indexed by fSign). */
496RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
497RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
498
499
#if 0 /* Compiled out in SOURCE; kept as-is. */
/** 128-bit floating point constant: 2.0 */
const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
#endif
504
505
/* The next section is generated by tools/IEMGenFpuConstants: */
507
508/** The ln2 constant as 128-bit floating point value.
509 * base-10: 6.93147180559945309417232121458176575e-1
510 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
511 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
512//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
513const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
514/** High precision ln2 value.
515 * base-10: 6.931471805599453094172321214581765680747e-1
516 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
517 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
518const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
519/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
520 * base-10: 6.931471805599453094151379470289064954613e-1
521 * base-16: b.17217f7d1cf79abc0000000000000000@-1
522 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
523const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
524
525/** Horner constants for f2xm1 */
526const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
527{
528 /* a0
529 * base-10: 1.00000000000000000000000000000000000e0
530 * base-16: 1.0000000000000000000000000000@0
531 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
532 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
533 /* a1
534 * base-10: 5.00000000000000000000000000000000000e-1
535 * base-16: 8.0000000000000000000000000000@-1
536 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
537 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
538 /* a2
539 * base-10: 1.66666666666666666666666666666666658e-1
540 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
541 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
542 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
543 /* a3
544 * base-10: 4.16666666666666666666666666666666646e-2
545 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
546 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
547 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
548 /* a4
549 * base-10: 8.33333333333333333333333333333333323e-3
550 * base-16: 2.2222222222222222222222222222@-2
551 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
552 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
553 /* a5
554 * base-10: 1.38888888888888888888888888888888874e-3
555 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
556 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
557 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
558 /* a6
559 * base-10: 1.98412698412698412698412698412698412e-4
560 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
561 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
562 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
563 /* a7
564 * base-10: 2.48015873015873015873015873015873015e-5
565 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
566 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
567 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
568 /* a8
569 * base-10: 2.75573192239858906525573192239858902e-6
570 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
571 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
572 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
573 /* a9
574 * base-10: 2.75573192239858906525573192239858865e-7
575 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
576 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
577 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
578 /* a10
579 * base-10: 2.50521083854417187750521083854417184e-8
580 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
581 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
582 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
583 /* a11
584 * base-10: 2.08767569878680989792100903212014296e-9
585 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
586 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
587 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
588 /* a12
589 * base-10: 1.60590438368216145993923771701549472e-10
590 * base-16: b.092309d43684be51c198e91d7b40@-9
591 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
592 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
593 /* a13
594 * base-10: 1.14707455977297247138516979786821043e-11
595 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
596 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
597 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
598 /* a14
599 * base-10: 7.64716373181981647590113198578806964e-13
600 * base-16: d.73f9f399dc0f88ec32b587746578@-11
601 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
602 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
603 /* a15
604 * base-10: 4.77947733238738529743820749111754352e-14
605 * base-16: d.73f9f399dc0f88ec32b587746578@-12
606 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
607 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
608 /* a16
609 * base-10: 2.81145725434552076319894558301031970e-15
610 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
611 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
612 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
613 /* a17
614 * base-10: 1.56192069685862264622163643500573321e-16
615 * base-16: b.413c31dcbecbbdd8024435161550@-14
616 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
617 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
618 /* a18
619 * base-10: 8.22063524662432971695598123687227980e-18
620 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
621 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
622 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
623 /* a19
624 * base-10: 4.11031762331216485847799061843614006e-19
625 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
626 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
627 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
628 /* a20
629 * base-10: 1.95729410633912612308475743735054143e-20
630 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
631 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
632 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
633 /* a21
634 * base-10: 8.89679139245057328674889744250246106e-22
635 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
636 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
637 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
638};
639
640
/*
 * There are a few 64-bit on 32-bit things we'd rather do in C.  Actually, doing
 * it all in C is probably safer for now; optimize what's necessary later, maybe.
 */
645#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
646
647
648/*********************************************************************************************************************************
649* Binary Operations *
650*********************************************************************************************************************************/
651
/*
 * ADD
 */
655
656IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
657{
658 uint64_t uDst = *puDst;
659 uint64_t uResult = uDst + uSrc;
660 *puDst = uResult;
661 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
662}
663
664# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
665
666IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
667{
668 uint32_t uDst = *puDst;
669 uint32_t uResult = uDst + uSrc;
670 *puDst = uResult;
671 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
672}
673
674
675IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
676{
677 uint16_t uDst = *puDst;
678 uint16_t uResult = uDst + uSrc;
679 *puDst = uResult;
680 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
681}
682
683
684IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
685{
686 uint8_t uDst = *puDst;
687 uint8_t uResult = uDst + uSrc;
688 *puDst = uResult;
689 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
690}
691
692# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
693
/*
 * ADC
 */
697
698IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
699{
700 if (!(*pfEFlags & X86_EFL_CF))
701 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
702 else
703 {
704 uint64_t uDst = *puDst;
705 uint64_t uResult = uDst + uSrc + 1;
706 *puDst = uResult;
707 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
708 }
709}
710
711# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
712
713IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
714{
715 if (!(*pfEFlags & X86_EFL_CF))
716 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
717 else
718 {
719 uint32_t uDst = *puDst;
720 uint32_t uResult = uDst + uSrc + 1;
721 *puDst = uResult;
722 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
723 }
724}
725
726
727IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
728{
729 if (!(*pfEFlags & X86_EFL_CF))
730 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
731 else
732 {
733 uint16_t uDst = *puDst;
734 uint16_t uResult = uDst + uSrc + 1;
735 *puDst = uResult;
736 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
737 }
738}
739
740
741IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
742{
743 if (!(*pfEFlags & X86_EFL_CF))
744 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
745 else
746 {
747 uint8_t uDst = *puDst;
748 uint8_t uResult = uDst + uSrc + 1;
749 *puDst = uResult;
750 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
751 }
752}
753
754# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
755
/*
 * SUB
 */
759# if !defined(RT_ARCH_ARM64)
760
761IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
762{
763 uint64_t uDst = *puDst;
764 uint64_t uResult = uDst - uSrc;
765 *puDst = uResult;
766 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
767}
768
769# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
770
771IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
772{
773 uint32_t uDst = *puDst;
774 uint32_t uResult = uDst - uSrc;
775 *puDst = uResult;
776 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
777}
778
779
780IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
781{
782 uint16_t uDst = *puDst;
783 uint16_t uResult = uDst - uSrc;
784 *puDst = uResult;
785 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
786}
787
788
789IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
790{
791 uint8_t uDst = *puDst;
792 uint8_t uResult = uDst - uSrc;
793 *puDst = uResult;
794 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
795}
796
797# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
798# endif /* !RT_ARCH_ARM64 */
799
800/*
801 * SBB
802 */
803
804IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
805{
806 if (!(*pfEFlags & X86_EFL_CF))
807 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
808 else
809 {
810 uint64_t uDst = *puDst;
811 uint64_t uResult = uDst - uSrc - 1;
812 *puDst = uResult;
813 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
814 }
815}
816
817# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
818
819IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
820{
821 if (!(*pfEFlags & X86_EFL_CF))
822 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
823 else
824 {
825 uint32_t uDst = *puDst;
826 uint32_t uResult = uDst - uSrc - 1;
827 *puDst = uResult;
828 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
829 }
830}
831
832
833IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
834{
835 if (!(*pfEFlags & X86_EFL_CF))
836 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
837 else
838 {
839 uint16_t uDst = *puDst;
840 uint16_t uResult = uDst - uSrc - 1;
841 *puDst = uResult;
842 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
843 }
844}
845
846
847IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
848{
849 if (!(*pfEFlags & X86_EFL_CF))
850 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
851 else
852 {
853 uint8_t uDst = *puDst;
854 uint8_t uResult = uDst - uSrc - 1;
855 *puDst = uResult;
856 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
857 }
858}
859
860# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
861
862
863/*
864 * OR
865 */
866
867IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
868{
869 uint64_t uResult = *puDst | uSrc;
870 *puDst = uResult;
871 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
872}
873
874# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
875
876IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
877{
878 uint32_t uResult = *puDst | uSrc;
879 *puDst = uResult;
880 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
881}
882
883
884IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
885{
886 uint16_t uResult = *puDst | uSrc;
887 *puDst = uResult;
888 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
889}
890
891
892IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
893{
894 uint8_t uResult = *puDst | uSrc;
895 *puDst = uResult;
896 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
897}
898
899# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
900
901/*
902 * XOR
903 */
904
905IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
906{
907 uint64_t uResult = *puDst ^ uSrc;
908 *puDst = uResult;
909 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
910}
911
912# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
913
914IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
915{
916 uint32_t uResult = *puDst ^ uSrc;
917 *puDst = uResult;
918 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
919}
920
921
922IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
923{
924 uint16_t uResult = *puDst ^ uSrc;
925 *puDst = uResult;
926 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
927}
928
929
930IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
931{
932 uint8_t uResult = *puDst ^ uSrc;
933 *puDst = uResult;
934 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
935}
936
937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
938
939/*
940 * AND
941 */
942
943IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
944{
945 uint64_t const uResult = *puDst & uSrc;
946 *puDst = uResult;
947 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
948}
949
950# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
951
952IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
953{
954 uint32_t const uResult = *puDst & uSrc;
955 *puDst = uResult;
956 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
957}
958
959
960IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
961{
962 uint16_t const uResult = *puDst & uSrc;
963 *puDst = uResult;
964 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
965}
966
967
968IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
969{
970 uint8_t const uResult = *puDst & uSrc;
971 *puDst = uResult;
972 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
973}
974
975# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
976#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
977
978/*
979 * ANDN (BMI1 instruction)
980 */
981
982IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
983{
984 uint64_t const uResult = ~uSrc1 & uSrc2;
985 *puDst = uResult;
986 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
987}
988
989
990IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
991{
992 uint32_t const uResult = ~uSrc1 & uSrc2;
993 *puDst = uResult;
994 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
995}
996
997
998#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
999IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
1000{
1001 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1002}
1003#endif
1004
1005
1006#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1007IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1008{
1009 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1010}
1011#endif
1012
1013#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1014
1015/*
1016 * CMP
1017 */
1018
1019IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1020{
1021 uint64_t uDstTmp = *puDst;
1022 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1023}
1024
1025# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1026
1027IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1028{
1029 uint32_t uDstTmp = *puDst;
1030 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1031}
1032
1033
1034IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1035{
1036 uint16_t uDstTmp = *puDst;
1037 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1038}
1039
1040
1041IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1042{
1043 uint8_t uDstTmp = *puDst;
1044 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1045}
1046
1047# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1048
1049/*
1050 * TEST
1051 */
1052
1053IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1054{
1055 uint64_t uResult = *puDst & uSrc;
1056 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 64, 0);
1057}
1058
1059# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1060
1061IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1062{
1063 uint32_t uResult = *puDst & uSrc;
1064 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 32, 0);
1065}
1066
1067
1068IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1069{
1070 uint16_t uResult = *puDst & uSrc;
1071 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 16, 0);
1072}
1073
1074
1075IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1076{
1077 uint8_t uResult = *puDst & uSrc;
1078 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(pfEFlags, uResult, 8, 0);
1079}
1080
1081# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1082
1083
1084/*
1085 * LOCK prefixed variants of the above
1086 */
1087
/**
 * Locked binary operand operation for any of the supported operand widths.
 *
 * Relies on puDst, uSrc and pfEFlags being in scope at the expansion site.
 * The non-locked worker runs on private copies of the destination and of
 * EFLAGS, and the new value is published with a compare-and-exchange.  The
 * attempt is repeated until the exchange succeeds; only then is the caller's
 * EFLAGS overwritten.
 *
 * @param   a_Mnemonic      Instruction mnemonic used in the worker name.
 * @param   a_cBitsWidth    Operand width in bits (selects types and helpers).
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uExpected = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uNew; \
        uint32_t                   fEflNew; \
        for (;;) \
        { \
            uNew    = uExpected; \
            fEflNew = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNew, uSrc, &fEflNew); \
            /* &uExpected is handed to the exchange (presumably refreshed with the current value on failure). */ \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uExpected, &uExpected)) \
                break; \
        } \
        *pfEFlags = fEflNew; \
    } while (0)
1102
1103
/**
 * Emits the function iemAImpl_<mnemonic>_u<width>_locked, whose body is the
 * DO_LOCKED_BIN_OP retry loop around the corresponding non-locked worker.
 *
 * @param   a_Mnemonic      Instruction mnemonic.
 * @param   a_cBitsWidth    Operand width in bits.
 */
#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked, \
                      (uint ## a_cBitsWidth ## _t *puDst, uint ## a_cBitsWidth ## _t uSrc, uint32_t *pfEFlags)) \
    { \
        DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
    }
1111
1112EMIT_LOCKED_BIN_OP(add, 64)
1113EMIT_LOCKED_BIN_OP(adc, 64)
1114EMIT_LOCKED_BIN_OP(sub, 64)
1115EMIT_LOCKED_BIN_OP(sbb, 64)
1116EMIT_LOCKED_BIN_OP(or, 64)
1117EMIT_LOCKED_BIN_OP(xor, 64)
1118EMIT_LOCKED_BIN_OP(and, 64)
1119# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1120EMIT_LOCKED_BIN_OP(add, 32)
1121EMIT_LOCKED_BIN_OP(adc, 32)
1122EMIT_LOCKED_BIN_OP(sub, 32)
1123EMIT_LOCKED_BIN_OP(sbb, 32)
1124EMIT_LOCKED_BIN_OP(or, 32)
1125EMIT_LOCKED_BIN_OP(xor, 32)
1126EMIT_LOCKED_BIN_OP(and, 32)
1127
1128EMIT_LOCKED_BIN_OP(add, 16)
1129EMIT_LOCKED_BIN_OP(adc, 16)
1130EMIT_LOCKED_BIN_OP(sub, 16)
1131EMIT_LOCKED_BIN_OP(sbb, 16)
1132EMIT_LOCKED_BIN_OP(or, 16)
1133EMIT_LOCKED_BIN_OP(xor, 16)
1134EMIT_LOCKED_BIN_OP(and, 16)
1135
1136EMIT_LOCKED_BIN_OP(add, 8)
1137EMIT_LOCKED_BIN_OP(adc, 8)
1138EMIT_LOCKED_BIN_OP(sub, 8)
1139EMIT_LOCKED_BIN_OP(sbb, 8)
1140EMIT_LOCKED_BIN_OP(or, 8)
1141EMIT_LOCKED_BIN_OP(xor, 8)
1142EMIT_LOCKED_BIN_OP(and, 8)
1143# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1144
1145
1146/*
1147 * Bit operations (same signature as above).
1148 */
1149
1150/*
1151 * BT
1152 */
1153
1154IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1155{
1156 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1157 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1158 Assert(uSrc < 64);
1159 uint64_t uDst = *puDst;
1160 if (uDst & RT_BIT_64(uSrc))
1161 *pfEFlags |= X86_EFL_CF;
1162 else
1163 *pfEFlags &= ~X86_EFL_CF;
1164}
1165
1166# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1167
1168IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1169{
1170 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1171 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1172 Assert(uSrc < 32);
1173 uint32_t uDst = *puDst;
1174 if (uDst & RT_BIT_32(uSrc))
1175 *pfEFlags |= X86_EFL_CF;
1176 else
1177 *pfEFlags &= ~X86_EFL_CF;
1178}
1179
1180IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1181{
1182 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1183 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1184 Assert(uSrc < 16);
1185 uint16_t uDst = *puDst;
1186 if (uDst & RT_BIT_32(uSrc))
1187 *pfEFlags |= X86_EFL_CF;
1188 else
1189 *pfEFlags &= ~X86_EFL_CF;
1190}
1191
1192# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1193
1194/*
1195 * BTC
1196 */
1197
1198IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1199{
1200 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1201 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1202 Assert(uSrc < 64);
1203 uint64_t fMask = RT_BIT_64(uSrc);
1204 uint64_t uDst = *puDst;
1205 if (uDst & fMask)
1206 {
1207 uDst &= ~fMask;
1208 *puDst = uDst;
1209 *pfEFlags |= X86_EFL_CF;
1210 }
1211 else
1212 {
1213 uDst |= fMask;
1214 *puDst = uDst;
1215 *pfEFlags &= ~X86_EFL_CF;
1216 }
1217}
1218
1219# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1220
1221IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1222{
1223 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1224 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1225 Assert(uSrc < 32);
1226 uint32_t fMask = RT_BIT_32(uSrc);
1227 uint32_t uDst = *puDst;
1228 if (uDst & fMask)
1229 {
1230 uDst &= ~fMask;
1231 *puDst = uDst;
1232 *pfEFlags |= X86_EFL_CF;
1233 }
1234 else
1235 {
1236 uDst |= fMask;
1237 *puDst = uDst;
1238 *pfEFlags &= ~X86_EFL_CF;
1239 }
1240}
1241
1242
1243IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1244{
1245 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1246 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1247 Assert(uSrc < 16);
1248 uint16_t fMask = RT_BIT_32(uSrc);
1249 uint16_t uDst = *puDst;
1250 if (uDst & fMask)
1251 {
1252 uDst &= ~fMask;
1253 *puDst = uDst;
1254 *pfEFlags |= X86_EFL_CF;
1255 }
1256 else
1257 {
1258 uDst |= fMask;
1259 *puDst = uDst;
1260 *pfEFlags &= ~X86_EFL_CF;
1261 }
1262}
1263
1264# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1265
1266/*
1267 * BTR
1268 */
1269
1270IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1271{
1272 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1273 logical operation (AND/OR/whatever). */
1274 Assert(uSrc < 64);
1275 uint64_t fMask = RT_BIT_64(uSrc);
1276 uint64_t uDst = *puDst;
1277 if (uDst & fMask)
1278 {
1279 uDst &= ~fMask;
1280 *puDst = uDst;
1281 *pfEFlags |= X86_EFL_CF;
1282 }
1283 else
1284 *pfEFlags &= ~X86_EFL_CF;
1285}
1286
1287# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1288
1289IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1290{
1291 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1292 logical operation (AND/OR/whatever). */
1293 Assert(uSrc < 32);
1294 uint32_t fMask = RT_BIT_32(uSrc);
1295 uint32_t uDst = *puDst;
1296 if (uDst & fMask)
1297 {
1298 uDst &= ~fMask;
1299 *puDst = uDst;
1300 *pfEFlags |= X86_EFL_CF;
1301 }
1302 else
1303 *pfEFlags &= ~X86_EFL_CF;
1304}
1305
1306
1307IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1308{
1309 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1310 logical operation (AND/OR/whatever). */
1311 Assert(uSrc < 16);
1312 uint16_t fMask = RT_BIT_32(uSrc);
1313 uint16_t uDst = *puDst;
1314 if (uDst & fMask)
1315 {
1316 uDst &= ~fMask;
1317 *puDst = uDst;
1318 *pfEFlags |= X86_EFL_CF;
1319 }
1320 else
1321 *pfEFlags &= ~X86_EFL_CF;
1322}
1323
1324# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1325
1326/*
1327 * BTS
1328 */
1329
1330IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1331{
1332 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1333 logical operation (AND/OR/whatever). */
1334 Assert(uSrc < 64);
1335 uint64_t fMask = RT_BIT_64(uSrc);
1336 uint64_t uDst = *puDst;
1337 if (uDst & fMask)
1338 *pfEFlags |= X86_EFL_CF;
1339 else
1340 {
1341 uDst |= fMask;
1342 *puDst = uDst;
1343 *pfEFlags &= ~X86_EFL_CF;
1344 }
1345}
1346
1347# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1348
1349IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1350{
1351 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1352 logical operation (AND/OR/whatever). */
1353 Assert(uSrc < 32);
1354 uint32_t fMask = RT_BIT_32(uSrc);
1355 uint32_t uDst = *puDst;
1356 if (uDst & fMask)
1357 *pfEFlags |= X86_EFL_CF;
1358 else
1359 {
1360 uDst |= fMask;
1361 *puDst = uDst;
1362 *pfEFlags &= ~X86_EFL_CF;
1363 }
1364}
1365
1366
1367IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1368{
1369 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1370 logical operation (AND/OR/whatever). */
1371 Assert(uSrc < 16);
1372 uint16_t fMask = RT_BIT_32(uSrc);
1373 uint32_t uDst = *puDst;
1374 if (uDst & fMask)
1375 *pfEFlags |= X86_EFL_CF;
1376 else
1377 {
1378 uDst |= fMask;
1379 *puDst = uDst;
1380 *pfEFlags &= ~X86_EFL_CF;
1381 }
1382}
1383
1384# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1385
1386
1387EMIT_LOCKED_BIN_OP(btc, 64)
1388EMIT_LOCKED_BIN_OP(btr, 64)
1389EMIT_LOCKED_BIN_OP(bts, 64)
1390# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1391EMIT_LOCKED_BIN_OP(btc, 32)
1392EMIT_LOCKED_BIN_OP(btr, 32)
1393EMIT_LOCKED_BIN_OP(bts, 32)
1394
1395EMIT_LOCKED_BIN_OP(btc, 16)
1396EMIT_LOCKED_BIN_OP(btr, 16)
1397EMIT_LOCKED_BIN_OP(bts, 16)
1398# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1399
1400
1401/*
1402 * Helpers for BSR and BSF.
1403 *
1404 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1405 * Intel behavior modelled on 10980xe, AMD on 3990X. Other marchs may
1406 * produce different result (see https://www.sandpile.org/x86/flags.htm),
1407 * but we restrict ourselves to emulating these recent marchs.
1408 */
/**
 * Stores a BSF/BSR result and updates EFLAGS, Intel flavour.
 *
 * All six status flags are cleared first.  If a bit was found, the zero-based
 * index is stored and the g_afParity entry for that index is ORed in; if not,
 * the destination is left untouched and ZF and PF are set.
 *
 * The macro now uses its own parameters throughout.  Previously the second
 * parameter was ignored and the body silently captured a variable named
 * pfEFlags from the expansion site.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_iBit      One-based index of the bit found, 0 if none was found.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR result and updates EFLAGS, AMD flavour.
 *
 * Only ZF is changed: cleared when a bit was found (and the zero-based index
 * stored), set when none was found (destination left untouched).
 *
 * The macro now uses its own parameters throughout.  Previously the second
 * parameter was ignored and the body silently captured a variable named
 * pfEFlags from the expansion site.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_iBit      One-based index of the bit found, 0 if none was found.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1431
1432
1433/*
1434 * BSF - first (least significant) bit set
1435 */
1436IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1437{
1438 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1439}
1440
1441IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1442{
1443 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1444}
1445
1446IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1447{
1448 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1449}
1450
1451# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1452
1453IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1454{
1455 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1456}
1457
1458IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1459{
1460 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1461}
1462
1463IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1464{
1465 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1466}
1467
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1472}
1473
1474IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1475{
1476 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1477}
1478
1479IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1480{
1481 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1482}
1483
1484# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1485
1486
1487/*
1488 * BSR - last (most significant) bit set
1489 */
1490IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1491{
1492 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1493}
1494
1495IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1496{
1497 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1498}
1499
1500IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1501{
1502 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1503}
1504
1505# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1506
1507IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1510}
1511
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516
1517IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1518{
1519 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1520}
1521
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1526}
1527
1528IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1529{
1530 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1531}
1532
1533IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1534{
1535 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1536}
1537
1538# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1539
1540
1541/*
1542 * Helpers for LZCNT and TZCNT.
1543 */
/**
 * Stores an LZCNT/TZCNT result and updates EFLAGS, Intel flavour.
 *
 * The count is always stored.  All six status flags are cleared, then: a
 * non-zero count ORs in its g_afParity entry, a zero count sets ZF and PF, and
 * a zero source sets CF.
 *
 * The source argument is parenthesised before negation so that expression
 * arguments (e.g. "a | b") are tested as a whole.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_uSrc      The source operand the count was taken of.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_uResult   The zero-bit count.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT result and updates EFLAGS, AMD flavour.
 *
 * The count is always stored.  Only ZF and CF are recalculated: ZF is set for
 * a zero count and CF for a zero source; the other flags keep their values.
 *
 * The source argument is parenthesised before negation so that expression
 * arguments (e.g. "a | b") are tested as a whole.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_uSrc      The source operand the count was taken of.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_uResult   The zero-bit count.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1566
1567
1568/*
1569 * LZCNT - count leading zero bits.
1570 */
1571IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1572{
1573 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1574}
1575
1576IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1577{
1578 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1579}
1580
1581IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1582{
1583 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1584}
1585
1586# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1587
1588IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1589{
1590 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1591}
1592
1593IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1594{
1595 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1596}
1597
1598IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1599{
1600 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1601}
1602
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1605{
1606 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1607}
1608
1609IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1610{
1611 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1612}
1613
1614IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1615{
1616 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1617}
1618
1619# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1620
1621
1622/*
 * TZCNT - count trailing zero bits.
1624 */
1625IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1626{
1627 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1628}
1629
1630IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1631{
1632 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1633}
1634
1635IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1636{
1637 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1638}
1639
1640# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1643{
1644 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1650}
1651
1652IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1653{
1654 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1655}
1656
1657
1658IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1659{
1660 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1661}
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1666}
1667
1668IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1669{
1670 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1671}
1672
1673# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1674#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1675
1676/*
1677 * BEXTR (BMI1 instruction)
1678 */
1679#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1680IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1681 a_Type uSrc2, uint32_t *pfEFlags)) \
1682{ \
1683 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1684 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1685 a_Type uResult; \
1686 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1687 if (iFirstBit < a_cBits) \
1688 { \
1689 uResult = uSrc1 >> iFirstBit; \
1690 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1691 if (cBits < a_cBits) \
1692 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1693 *puDst = uResult; \
1694 if (!uResult) \
1695 fEfl |= X86_EFL_ZF; \
1696 } \
1697 else \
1698 { \
1699 *puDst = uResult = 0; \
1700 fEfl |= X86_EFL_ZF; \
1701 } \
1702 /** @todo complete flag calculations. */ \
1703 *pfEFlags = fEfl; \
1704}
1705
1706EMIT_BEXTR(64, uint64_t, _fallback)
1707EMIT_BEXTR(32, uint32_t, _fallback)
1708#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1709EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1710#endif
1711#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1712EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1713#endif
1714
1715/*
1716 * BLSR (BMI1 instruction)
1717 */
1718#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1719IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1720{ \
1721 uint32_t fEfl1 = *pfEFlags; \
1722 uint32_t fEfl2 = fEfl1; \
1723 *puDst = uSrc; \
1724 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1725 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1726 \
1727 /* AMD: The carry flag is from the SUB operation. */ \
1728 /* 10890xe: PF always cleared? */ \
1729 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1730 fEfl2 |= fEfl1 & X86_EFL_CF; \
1731 *pfEFlags = fEfl2; \
1732}
1733
1734EMIT_BLSR(64, uint64_t, _fallback)
1735EMIT_BLSR(32, uint32_t, _fallback)
1736#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1737EMIT_BLSR(64, uint64_t, RT_NOTHING)
1738#endif
1739#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1740EMIT_BLSR(32, uint32_t, RT_NOTHING)
1741#endif
1742
1743/*
1744 * BLSMSK (BMI1 instruction)
1745 */
1746#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1747IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1748{ \
1749 uint32_t fEfl1 = *pfEFlags; \
1750 uint32_t fEfl2 = fEfl1; \
1751 *puDst = uSrc; \
1752 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1753 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1754 \
1755 /* AMD: The carry flag is from the SUB operation. */ \
1756 /* 10890xe: PF always cleared? */ \
1757 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1758 fEfl2 |= fEfl1 & X86_EFL_CF; \
1759 *pfEFlags = fEfl2; \
1760}
1761
1762EMIT_BLSMSK(64, uint64_t, _fallback)
1763EMIT_BLSMSK(32, uint32_t, _fallback)
1764#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1765EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1766#endif
1767#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1768EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1769#endif
1770
1771/*
1772 * BLSI (BMI1 instruction)
1773 */
1774#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1775IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1776{ \
1777 uint32_t fEfl1 = *pfEFlags; \
1778 uint32_t fEfl2 = fEfl1; \
1779 *puDst = uSrc; \
1780 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1781 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1782 \
1783 /* AMD: The carry flag is from the SUB operation. */ \
1784 /* 10890xe: PF always cleared? */ \
1785 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1786 fEfl2 |= fEfl1 & X86_EFL_CF; \
1787 *pfEFlags = fEfl2; \
1788}
1789
1790EMIT_BLSI(64, uint64_t, _fallback)
1791EMIT_BLSI(32, uint32_t, _fallback)
1792#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1793EMIT_BLSI(64, uint64_t, RT_NOTHING)
1794#endif
1795#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1796EMIT_BLSI(32, uint32_t, RT_NOTHING)
1797#endif
1798
1799/*
1800 * BZHI (BMI2 instruction)
1801 */
1802#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1803IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1804 a_Type uSrc2, uint32_t *pfEFlags)) \
1805{ \
1806 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1807 a_Type uResult; \
1808 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1809 if (iFirstBit < a_cBits) \
1810 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1811 else \
1812 { \
1813 uResult = uSrc1; \
1814 fEfl |= X86_EFL_CF; \
1815 } \
1816 *puDst = uResult; \
1817 fEfl |= X86_EFL_CALC_ZF(uResult); \
1818 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1819 *pfEFlags = fEfl; \
1820}
1821
1822EMIT_BZHI(64, uint64_t, _fallback)
1823EMIT_BZHI(32, uint32_t, _fallback)
1824#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1825EMIT_BZHI(64, uint64_t, RT_NOTHING)
1826#endif
1827#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1828EMIT_BZHI(32, uint32_t, RT_NOTHING)
1829#endif
1830
1831/*
1832 * POPCNT
1833 */
1834RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1835{
1836 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1837 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1838 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1839 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1840};
1841
1842/** @todo Use native popcount where possible and employ some more efficient
1843 * algorithm here (or in asm.h fallback)! */
1844
1845DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1846{
1847 return g_abBitCounts6[ u16 & 0x3f]
1848 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1849 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1850}
1851
1852DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1853{
1854 return g_abBitCounts6[ u32 & 0x3f]
1855 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1856 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1857 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1858 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1859 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1860}
1861
1862DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1863{
1864 return g_abBitCounts6[ u64 & 0x3f]
1865 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1866 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1867 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1868 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1869 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1870 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1871 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1872 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1873 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1874 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1875}
1876
1877#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1878IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1879{ \
1880 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1881 a_Type uResult; \
1882 if (uSrc) \
1883 uResult = iemPopCountU ## a_cBits(uSrc); \
1884 else \
1885 { \
1886 fEfl |= X86_EFL_ZF; \
1887 uResult = 0; \
1888 } \
1889 *puDst = uResult; \
1890 *pfEFlags = fEfl; \
1891}
1892
1893EMIT_POPCNT(64, uint64_t, _fallback)
1894EMIT_POPCNT(32, uint32_t, _fallback)
1895EMIT_POPCNT(16, uint16_t, _fallback)
1896#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1897EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1898#endif
1899#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1900EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1901EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1902#endif
1903
1904
1905#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1906
1907/*
1908 * XCHG
1909 */
1910
1911IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1912{
1913#if ARCH_BITS >= 64
1914 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1915#else
1916 uint64_t uOldMem = *puMem;
1917 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1918 ASMNopPause();
1919 *puReg = uOldMem;
1920#endif
1921}
1922
1923# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1924
1925IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1926{
1927 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1928}
1929
1930
1931IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1932{
1933 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1934}
1935
1936
1937IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1938{
1939 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1940}
1941
1942# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1943
1944
1945/* Unlocked variants for fDisregardLock mode: */
1946
1947IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1948{
1949 uint64_t const uOld = *puMem;
1950 *puMem = *puReg;
1951 *puReg = uOld;
1952}
1953
1954# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1955
1956IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1957{
1958 uint32_t const uOld = *puMem;
1959 *puMem = *puReg;
1960 *puReg = uOld;
1961}
1962
1963
1964IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1965{
1966 uint16_t const uOld = *puMem;
1967 *puMem = *puReg;
1968 *puReg = uOld;
1969}
1970
1971
1972IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1973{
1974 uint8_t const uOld = *puMem;
1975 *puMem = *puReg;
1976 *puReg = uOld;
1977}
1978
1979# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1980
1981
1982/*
1983 * XADD and LOCK XADD.
1984 */
1985#define EMIT_XADD(a_cBitsWidth, a_Type) \
1986IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1987{ \
1988 a_Type uDst = *puDst; \
1989 a_Type uResult = uDst; \
1990 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1991 *puDst = uResult; \
1992 *puReg = uDst; \
1993} \
1994\
1995IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1996{ \
1997 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1998 a_Type uResult; \
1999 uint32_t fEflTmp; \
2000 do \
2001 { \
2002 uResult = uOld; \
2003 fEflTmp = *pfEFlags; \
2004 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2005 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2006 *puReg = uOld; \
2007 *pfEFlags = fEflTmp; \
2008}
2009EMIT_XADD(64, uint64_t)
2010# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2011EMIT_XADD(32, uint32_t)
2012EMIT_XADD(16, uint16_t)
2013EMIT_XADD(8, uint8_t)
2014# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2015
2016#endif
2017
2018/*
2019 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2020 *
2021 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2022 * instructions are emulated as locked.
2023 */
2024#if defined(IEM_WITHOUT_ASSEMBLY)
2025
2026IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2027{
2028 uint8_t uOld = *puAl;
2029 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2030 Assert(*puAl == uOld);
2031 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2032}
2033
2034
2035IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2036{
2037 uint16_t uOld = *puAx;
2038 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2039 Assert(*puAx == uOld);
2040 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2041}
2042
2043
2044IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2045{
2046 uint32_t uOld = *puEax;
2047 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2048 Assert(*puEax == uOld);
2049 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2050}
2051
2052
2053# if ARCH_BITS == 32
2054IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2055# else
2056IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2057# endif
2058{
2059# if ARCH_BITS == 32
2060 uint64_t const uSrcReg = *puSrcReg;
2061# endif
2062 uint64_t uOld = *puRax;
2063 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2064 Assert(*puRax == uOld);
2065 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2066}
2067
2068
2069IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2070 uint32_t *pEFlags))
2071{
2072 uint64_t const uNew = pu64EbxEcx->u;
2073 uint64_t const uOld = pu64EaxEdx->u;
2074 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2075 {
2076 Assert(pu64EaxEdx->u == uOld);
2077 *pEFlags |= X86_EFL_ZF;
2078 }
2079 else
2080 *pEFlags &= ~X86_EFL_ZF;
2081}
2082
2083
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * Locked CMPXCHG16B (only built for AMD64 and ARM64 hosts).
 *
 * Attempts an atomic compare-exchange of the 128-bit destination against
 * RDX:RAX, storing RCX:RBX on a match.  The value found in the destination is
 * written back to @a pu128RaxRdx.  Only ZF is modified: set on a successful
 * exchange, cleared otherwise.
 *
 * @param   pu128Dst        The destination operand.
 * @param   pu128RaxRdx     The comparand (in) and observed value (out).
 * @param   pu128RbxRcx     The replacement value.
 * @param   pEFlags         The EFLAGS to update.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    /* Only needed by the assertion below. */
    RTUINT128U const uComparand = *pu128RaxRdx;
#  endif
#  if defined(RT_ARCH_AMD64)
    bool const fExchanged = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo,
                                                   pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo, &pu128RaxRdx->u);
#  else
    bool const fExchanged = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  endif
    if (!fExchanged)
        *pEFlags &= ~X86_EFL_ZF;
    else
    {
        Assert(pu128RaxRdx->s.Lo == uComparand.s.Lo && pu128RaxRdx->s.Hi == uComparand.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
}
# endif
2105
2106#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2107
2108# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2109IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2110 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2111{
2112 RTUINT128U u128Tmp = *pu128Dst;
2113 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2114 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2115 {
2116 *pu128Dst = *pu128RbxRcx;
2117 *pEFlags |= X86_EFL_ZF;
2118 }
2119 else
2120 {
2121 *pu128RaxRdx = u128Tmp;
2122 *pEFlags &= ~X86_EFL_ZF;
2123 }
2124}
2125#endif /* !RT_ARCH_ARM64 */
2126
2127#if defined(IEM_WITHOUT_ASSEMBLY)
2128
2129/* Unlocked versions mapped to the locked ones: */
2130
2131IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2132{
2133 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2134}
2135
2136
2137IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2138{
2139# if 0
2140 /* If correctly aligned, used the locked variation. */
2141 if (!((uintptr_t)pu16Dst & 1))
2142 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2143 else
2144# endif
2145 {
2146 /* Otherwise emulate it as best as we can. */
2147 uint16_t const uOld = *puAx;
2148 uint16_t const uDst = *pu16Dst;
2149 if (uOld == uDst)
2150 {
2151 *pu16Dst = uSrcReg;
2152 iemAImpl_cmp_u16(&uOld, uOld, pEFlags);
2153 }
2154 else
2155 {
2156 *puAx = uDst;
2157 iemAImpl_cmp_u16(&uOld, uDst, pEFlags);
2158 }
2159 }
2160}
2161
2162
2163IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2164{
2165# if 0
2166 /* If correctly aligned, used the locked variation. */
2167 if (!((uintptr_t)pu32Dst & 3))
2168 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2169 else
2170# endif
2171 {
2172 /* Otherwise emulate it as best as we can. */
2173 uint32_t const uOld = *puEax;
2174 uint32_t const uDst = *pu32Dst;
2175 if (uOld == uDst)
2176 {
2177 *pu32Dst = uSrcReg;
2178 iemAImpl_cmp_u32(&uOld, uOld, pEFlags);
2179 }
2180 else
2181 {
2182 *puEax = uDst;
2183 iemAImpl_cmp_u32(&uOld, uDst, pEFlags);
2184 }
2185 }
2186}
2187
2188
2189# if ARCH_BITS == 32
2190IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2191{
2192# if 0
2193 /* If correctly aligned, used the locked variation. */
2194 if (!((uintptr_t)pu32Dst & 7))
2195 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2196 else
2197# endif
2198 {
2199 /* Otherwise emulate it as best as we can. */
2200 uint64_t const uOld = *puRax;
2201 uint64_t const uSrc = *puSrcReg;
2202 uint64_t const uDst = *pu64Dst;
2203 if (uOld == uDst)
2204 {
2205 *pu64Dst = uSrc;
2206 iemAImpl_cmp_u64(&uOld, uOld, pEFlags);
2207 }
2208 else
2209 {
2210 *puRax = uDst;
2211 iemAImpl_cmp_u64(&uOld, uDst, pEFlags);
2212 }
2213 }
2214}
2215# else /* ARCH_BITS != 32 */
2216IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2217{
2218# if 0
2219 /* If correctly aligned, used the locked variation. */
2220 if (!((uintptr_t)pu64Dst & 7))
2221 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2222 else
2223# endif
2224 {
2225 /* Otherwise emulate it as best as we can. */
2226 uint64_t const uOld = *puRax;
2227 uint64_t const uDst = *pu64Dst;
2228 if (uOld == uDst)
2229 {
2230 *pu64Dst = uSrcReg;
2231 iemAImpl_cmp_u64(&uOld, uOld, pEFlags);
2232 }
2233 else
2234 {
2235 *puRax = uDst;
2236 iemAImpl_cmp_u64(&uOld, uDst, pEFlags);
2237 }
2238 }
2239}
2240# endif /* ARCH_BITS != 32 */
2241
2242
2243IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2244{
2245# if 0
2246 /* If correctly aligned, used the locked variation. */
2247 if (!((uintptr_t)pu64Dst & 7))
2248 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2249 else
2250# endif
2251 {
2252 /* Otherwise emulate it as best as we can. */
2253 uint64_t const uNew = pu64EbxEcx->u;
2254 uint64_t const uOld = pu64EaxEdx->u;
2255 uint64_t const uDst = *pu64Dst;
2256 if (uDst == uOld)
2257 {
2258 *pu64Dst = uNew;
2259 *pEFlags |= X86_EFL_ZF;
2260 }
2261 else
2262 {
2263 pu64EaxEdx->u = uDst;
2264 *pEFlags &= ~X86_EFL_ZF;
2265 }
2266 }
2267}
2268
2269
2270IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2271 uint32_t *pEFlags))
2272{
2273# if 0
2274 /* If correctly aligned, used the locked variation. */
2275 if (!((uintptr_t)pu64Dst & 15))
2276 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2277 else
2278# endif
2279 {
2280 /* Otherwise emulate it as best as we can. */
2281# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2282 uint128_t const uNew = pu128RbxRcx->u;
2283 uint128_t const uOld = pu128RaxRdx->u;
2284 uint128_t const uDst = pu128Dst->u;
2285 if (uDst == uOld)
2286 {
2287 pu128Dst->u = uNew;
2288 *pEFlags |= X86_EFL_ZF;
2289 }
2290 else
2291 {
2292 pu128RaxRdx->u = uDst;
2293 *pEFlags &= ~X86_EFL_ZF;
2294 }
2295# else
2296 RTUINT128U const uNew = *pu128RbxRcx;
2297 RTUINT128U const uOld = *pu128RaxRdx;
2298 RTUINT128U const uDst = *pu128Dst;
2299 if ( uDst.s.Lo == uOld.s.Lo
2300 && uDst.s.Hi == uOld.s.Hi)
2301 {
2302 *pu128Dst = uNew;
2303 *pEFlags |= X86_EFL_ZF;
2304 }
2305 else
2306 {
2307 *pu128RaxRdx = uDst;
2308 *pEFlags &= ~X86_EFL_ZF;
2309 }
2310# endif
2311 }
2312}
2313
2314#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2315
2316#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2317 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2318
2319/*
2320 * MUL, IMUL, DIV and IDIV helpers.
2321 *
2322 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2323 * division step so we can select between using C operators and
2324 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2325 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
 *   IDIV/DIV taking all of their input in AX too.  This means we have to
 *   abstract some of the input loads and the result storing.
2329 */
2330
2331DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2332{
2333# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2334 pQuotient->s.Lo = 0;
2335 pQuotient->s.Hi = 0;
2336# endif
2337 RTUINT128U Divisor;
2338 Divisor.s.Lo = u64Divisor;
2339 Divisor.s.Hi = 0;
2340 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2341}
2342
/*
 * Operand access and arithmetic primitives plugged into the EMIT_MUL,
 * EMIT_IMUL, EMIT_DIV and EMIT_IDIV templates.
 *
 * The load/store macros rely on the emitted function having parameters named
 * puA and puD (or puAX for the 8-bit forms).  All of them expand to a single
 * expression without a trailing semicolon, so they are safe to use as the sole
 * statement of an unbraced if/else.
 */

/* Dividend loading: low half from *puA and high half from *puD, or all of *puAX for 8-bit. */
# define DIV_LOAD(a_Dividend) \
    (a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD
# define DIV_LOAD_U8(a_Dividend) \
    (a_Dividend).u = *puAX

/* Division result storing: quotient to *puA and remainder to *puD, or packed as
   quotient in the low byte and remainder in the high byte of *puAX for 8-bit. */
# define DIV_STORE(a_Quotient, a_uRemainder)    *puA = (a_Quotient), *puD = (a_uRemainder)
# define DIV_STORE_U8(a_Quotient, a_uRemainder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8)

/* First multiplication factor: *puA, or the low byte of *puAX for 8-bit. */
# define MUL_LOAD_F1()                          (*puA)
# define MUL_LOAD_F1_U8()                       ((uint8_t)*puAX)

/* Product storing: low half to *puA and high half to *puD, or all of it to *puAX for 8-bit. */
# define MUL_STORE(a_Result)                    *puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi
# define MUL_STORE_U8(a_Result)                 *puAX = (a_Result).u

/* Two's complement negation of a double-width value. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Unsigned multiplication producing a double-width result. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2)

/* Unsigned division of a double-width dividend, yielding quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    (a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
    (a_Remainder).u = (a_Dividend).u % (a_uDivisor)
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), a_uDivisor)
2372
2373
2374/*
2375 * MUL
2376 */
2377# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2378IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2379{ \
2380 RTUINT ## a_cBitsWidth2x ## U Result; \
2381 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2382 a_fnStore(Result); \
2383 \
2384 /* Calc EFLAGS: */ \
2385 uint32_t fEfl = *pfEFlags; \
2386 if (a_fIntelFlags) \
2387 { /* Intel: 6700K and 10980XE behavior */ \
2388 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2389 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2390 fEfl |= X86_EFL_SF; \
2391 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2392 if (Result.s.Hi != 0) \
2393 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2394 } \
2395 else \
2396 { /* AMD: 3990X */ \
2397 if (Result.s.Hi != 0) \
2398 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2399 else \
2400 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2401 } \
2402 *pfEFlags = fEfl; \
2403 return 0; \
2404} \
2405
2406# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2407 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2408 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2409 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2410
2411# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2412EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2413 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2414# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2415EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2416 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2417EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2418 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2419EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2420 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2421# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2422# endif /* !DOXYGEN_RUNNING */
2423
2424/*
2425 * MULX
2426 */
2427# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2428IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2429 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2430{ \
2431 RTUINT ## a_cBitsWidth2x ## U Result; \
2432 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2433 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2434 *puDst1 = Result.s.Hi; \
2435} \
2436
2437# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2438EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2439EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2440# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2441EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2442EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2443# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2444# endif /* !DOXYGEN_RUNNING */
2445
2446
2447/*
2448 * IMUL
2449 *
2450 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2451 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2452 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2453 */
2454# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2455 a_Suffix, a_fIntelFlags) \
2456IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2457{ \
2458 RTUINT ## a_cBitsWidth2x ## U Result; \
2459 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2460 \
2461 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2462 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2463 { \
2464 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2465 { \
2466 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2467 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2468 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2469 } \
2470 else \
2471 { \
2472 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2473 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2474 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2475 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2476 a_fnNeg(Result, a_cBitsWidth2x); \
2477 } \
2478 } \
2479 else \
2480 { \
2481 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2482 { \
2483 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2484 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2485 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2486 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2487 a_fnNeg(Result, a_cBitsWidth2x); \
2488 } \
2489 else \
2490 { \
2491 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2492 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2493 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2494 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2495 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2496 } \
2497 } \
2498 a_fnStore(Result); \
2499 \
2500 if (a_fIntelFlags) \
2501 { \
2502 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2503 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2504 fEfl |= X86_EFL_SF; \
2505 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2506 } \
2507 *pfEFlags = fEfl; \
2508 return 0; \
2509}
2510# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2511 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2512 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2513 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2514
2515# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2516EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2517 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2518# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2519EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2520 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2521EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2522 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2523EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2524 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2525# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2526# endif /* !DOXYGEN_RUNNING */
2527
2528
2529/*
2530 * IMUL with two operands are mapped onto the three operand variant, ignoring
2531 * the high part of the product.
2532 */
2533# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2534IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2535{ \
2536 a_uType uIgn; \
2537 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2538} \
2539\
2540IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2541{ \
2542 a_uType uIgn; \
2543 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2544} \
2545\
2546IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2547{ \
2548 a_uType uIgn; \
2549 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2550}
2551
2552EMIT_IMUL_TWO(64, uint64_t)
2553# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2554EMIT_IMUL_TWO(32, uint32_t)
2555EMIT_IMUL_TWO(16, uint16_t)
2556# endif
2557
2558
2559/*
2560 * DIV
2561 */
2562# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2563 a_Suffix, a_fIntelFlags) \
2564IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2565{ \
2566 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2567 a_fnLoad(Dividend); \
2568 if ( uDivisor != 0 \
2569 && Dividend.s.Hi < uDivisor) \
2570 { \
2571 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2572 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2573 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2574 \
2575 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2576 if (!a_fIntelFlags) \
2577 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2578 return 0; \
2579 } \
2580 /* #DE */ \
2581 return -1; \
2582}
2583# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2584 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2585 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2586 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2587
2588# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2589EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2590 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2591# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2592EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2593 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2594EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2595 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2596EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2597 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2598# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2599# endif /* !DOXYGEN_RUNNING */
2600
2601
/*
 * IDIV
 *
 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE.  AMD 3990X will
 * set AF and clear PF, ZF and SF just like it does for DIV.
 *
 * The signed division is carried out on the magnitudes of dividend and divisor
 * using the unsigned divider.  The quotient is negated when the operand signs
 * differ, and the remainder takes the sign of the dividend.  The emitted
 * functions return 0 on success and -1 when a \#DE should be raised (zero
 * divisor or quotient out of range).
 */
# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    /* Note! Skylake leaves all flags alone. */ \
    \
    /** @todo overflow checks */ \
    if (uDivisor == 0) \
        return -1; /* #DE */ \
    \
    /* \
     * Convert to unsigned division. \
     */ \
    RTUINT ## a_cBitsWidth2x ## U Dividend; \
    a_fnLoad(Dividend); \
    bool const fNegDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
    if (fNegDividend) \
        a_fnNeg(Dividend, a_cBitsWidth2x); \
    \
    bool const                       fNegDivisor = RT_BOOL(uDivisor & RT_BIT_64(a_cBitsWidth - 1)); \
    uint ## a_cBitsWidth ## _t const uDivisorAbs = fNegDivisor \
                                                 ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - uDivisor) \
                                                 : uDivisor; \
    \
    RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
    a_fnDivRem(Quotient, Remainder, Dividend, uDivisorAbs); \
    \
    /* \
     * Check for overflow: a negative quotient may have a magnitude of up to \
     * 2^(width-1), a positive one must not exceed the signed maximum. \
     */ \
    bool const     fNegQuotient = fNegDividend != fNegDivisor; \
    uint64_t const uQuotientMax = fNegQuotient \
                                ? RT_BIT_64(a_cBitsWidth - 1) \
                                : (uint64_t)(uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX; \
    if (Quotient.s.Hi != 0 || Quotient.s.Lo > uQuotientMax) \
        return -1; /* #DE */ \
    \
    /* \
     * Apply the signs and store the result. \
     */ \
    uint ## a_cBitsWidth ## _t const uQuotientOut  = fNegQuotient \
                                                   ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo) \
                                                   : (uint ## a_cBitsWidth ## _t)Quotient.s.Lo; \
    uint ## a_cBitsWidth ## _t const uRemainderOut = fNegDividend \
                                                   ? (uint ## a_cBitsWidth ## _t)(UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo) \
                                                   : (uint ## a_cBitsWidth ## _t)Remainder.s.Lo; \
    a_fnStore(uQuotientOut, uRemainderOut); \
    if (!a_fIntelFlags) \
        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
    return 0; \
}

/* Emits the default, _intel and _amd flavours; the default one uses the Intel flag behaviour. */
# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel,     1) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd,       0)
2697
2698# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2699EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2700 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2701# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2702EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2703 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2704EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2705 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2706EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2707 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2708# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2709# endif /* !DOXYGEN_RUNNING */
2710
2711#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2712
2713
2714/*********************************************************************************************************************************
2715* Unary operations. *
2716*********************************************************************************************************************************/
2717#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2718
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Updates the status bits PF, AF, ZF, SF and OF for an INC or DEC instruction.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * This is a statement macro (do/while wrapper): it yields no value, the new
 * flags are written back through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be a
 *                          plain literal as it is token-pasted.
 * @param   a_OfMethod      0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= (~X86_EFL_STATUS_BITS) | X86_EFL_CF; /* clear every status bit except CF */ \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* INC overflows when the sign bit goes 0 -> 1, DEC when it goes 1 -> 0. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_OfMethod) == 0 \
                                                   ? (((a_uDst) ^ RT_BIT_64((a_cBitsWidth) - 1)) & (a_uResult)) \
                                                   : ((a_uDst) & ((a_uResult) ^ RT_BIT_64((a_cBitsWidth) - 1))) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2744
2745/*
2746 * INC
2747 */
2748
2749IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2750{
2751 uint64_t uDst = *puDst;
2752 uint64_t uResult = uDst + 1;
2753 *puDst = uResult;
2754 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2755}
2756
2757# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2758
2759IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2760{
2761 uint32_t uDst = *puDst;
2762 uint32_t uResult = uDst + 1;
2763 *puDst = uResult;
2764 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2765}
2766
2767
2768IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2769{
2770 uint16_t uDst = *puDst;
2771 uint16_t uResult = uDst + 1;
2772 *puDst = uResult;
2773 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2774}
2775
2776IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2777{
2778 uint8_t uDst = *puDst;
2779 uint8_t uResult = uDst + 1;
2780 *puDst = uResult;
2781 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2782}
2783
2784# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2785
2786
2787/*
2788 * DEC
2789 */
2790
2791IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2792{
2793 uint64_t uDst = *puDst;
2794 uint64_t uResult = uDst - 1;
2795 *puDst = uResult;
2796 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2797}
2798
2799# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2800
2801IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2802{
2803 uint32_t uDst = *puDst;
2804 uint32_t uResult = uDst - 1;
2805 *puDst = uResult;
2806 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2807}
2808
2809
2810IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2811{
2812 uint16_t uDst = *puDst;
2813 uint16_t uResult = uDst - 1;
2814 *puDst = uResult;
2815 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2816}
2817
2818
2819IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2820{
2821 uint8_t uDst = *puDst;
2822 uint8_t uResult = uDst - 1;
2823 *puDst = uResult;
2824 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2825}
2826
2827# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2828
2829
2830/*
2831 * NOT
2832 */
2833
2834IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2835{
2836 uint64_t uDst = *puDst;
2837 uint64_t uResult = ~uDst;
2838 *puDst = uResult;
2839 /* EFLAGS are not modified. */
2840 RT_NOREF_PV(pfEFlags);
2841}
2842
2843# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2844
2845IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2846{
2847 uint32_t uDst = *puDst;
2848 uint32_t uResult = ~uDst;
2849 *puDst = uResult;
2850 /* EFLAGS are not modified. */
2851 RT_NOREF_PV(pfEFlags);
2852}
2853
2854IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2855{
2856 uint16_t uDst = *puDst;
2857 uint16_t uResult = ~uDst;
2858 *puDst = uResult;
2859 /* EFLAGS are not modified. */
2860 RT_NOREF_PV(pfEFlags);
2861}
2862
2863IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2864{
2865 uint8_t uDst = *puDst;
2866 uint8_t uResult = ~uDst;
2867 *puDst = uResult;
2868 /* EFLAGS are not modified. */
2869 RT_NOREF_PV(pfEFlags);
2870}
2871
2872# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2873
2874
2875/*
2876 * NEG
2877 */
2878
/**
 * Recomputes the status bits (CF, PF, AF, ZF, SF, and OF) after a NEG.
 *
 * Statement macro: nothing is yielded, the new flags are stored through
 * @a a_pfEFlags.  CF is set whenever the original operand was non-zero; OF is
 * taken from the top bit of (original & result).
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for CF, AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflNew = *(a_pfEFlags) & ~(X86_EFL_STATUS_BITS | X86_EFL_CF); \
        fEflNew |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
        fEflNew |= g_afParity[(a_uResult) & 0xff]; \
        fEflNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        *(a_pfEFlags) = fEflNew; \
    } while (0)
2900
2901IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2902{
2903 uint64_t uDst = *puDst;
2904 uint64_t uResult = (uint64_t)0 - uDst;
2905 *puDst = uResult;
2906 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2907}
2908
2909# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2910
2911IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2912{
2913 uint32_t uDst = *puDst;
2914 uint32_t uResult = (uint32_t)0 - uDst;
2915 *puDst = uResult;
2916 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2917}
2918
2919
2920IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2921{
2922 uint16_t uDst = *puDst;
2923 uint16_t uResult = (uint16_t)0 - uDst;
2924 *puDst = uResult;
2925 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2926}
2927
2928
2929IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2930{
2931 uint8_t uDst = *puDst;
2932 uint8_t uResult = (uint8_t)0 - uDst;
2933 *puDst = uResult;
2934 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2935}
2936
2937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2938
2939/*
2940 * Locked variants.
2941 */
2942
2943/** Emit a function for doing a locked unary operand operation. */
2944# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2945 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2946 uint32_t *pfEFlags)) \
2947 { \
2948 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2949 uint ## a_cBitsWidth ## _t uTmp; \
2950 uint32_t fEflTmp; \
2951 do \
2952 { \
2953 uTmp = uOld; \
2954 fEflTmp = *pfEFlags; \
2955 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2956 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2957 *pfEFlags = fEflTmp; \
2958 }
2959
2960EMIT_LOCKED_UNARY_OP(inc, 64)
2961EMIT_LOCKED_UNARY_OP(dec, 64)
2962EMIT_LOCKED_UNARY_OP(not, 64)
2963EMIT_LOCKED_UNARY_OP(neg, 64)
2964# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2965EMIT_LOCKED_UNARY_OP(inc, 32)
2966EMIT_LOCKED_UNARY_OP(dec, 32)
2967EMIT_LOCKED_UNARY_OP(not, 32)
2968EMIT_LOCKED_UNARY_OP(neg, 32)
2969
2970EMIT_LOCKED_UNARY_OP(inc, 16)
2971EMIT_LOCKED_UNARY_OP(dec, 16)
2972EMIT_LOCKED_UNARY_OP(not, 16)
2973EMIT_LOCKED_UNARY_OP(neg, 16)
2974
2975EMIT_LOCKED_UNARY_OP(inc, 8)
2976EMIT_LOCKED_UNARY_OP(dec, 8)
2977EMIT_LOCKED_UNARY_OP(not, 8)
2978EMIT_LOCKED_UNARY_OP(neg, 8)
2979# endif
2980
2981#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2982
2983
2984/*********************************************************************************************************************************
2985* Shifting and Rotating *
2986*********************************************************************************************************************************/
2987
2988/*
2989 * ROL
2990 */
2991#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2992IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2993{ \
2994 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2995 if (cShift) \
2996 { \
2997 if (a_cBitsWidth < 32) \
2998 cShift &= a_cBitsWidth - 1; \
2999 a_uType const uDst = *puDst; \
3000 a_uType const uResult = a_fnHlp(uDst, cShift); \
3001 *puDst = uResult; \
3002 \
3003 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3004 it the same way as for 1 bit shifts. */ \
3005 AssertCompile(X86_EFL_CF_BIT == 0); \
3006 uint32_t fEfl = *pfEFlags; \
3007 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3008 uint32_t const fCarry = (uResult & X86_EFL_CF); \
3009 fEfl |= fCarry; \
3010 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3011 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
3012 else /* Intel 10980XE: According to the first sub-shift: */ \
3013 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3014 *pfEFlags = fEfl; \
3015 } \
3016}
3017
3018#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3019EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
3020#endif
3021EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
3022EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
3023
3024#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3025EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
3026#endif
3027EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
3028EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
3029
3030DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
3031{
3032 return (uValue << cShift) | (uValue >> (16 - cShift));
3033}
3034#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3035EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
3036#endif
3037EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
3038EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
3039
3040DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
3041{
3042 return (uValue << cShift) | (uValue >> (8 - cShift));
3043}
3044#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3045EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
3046#endif
3047EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
3048EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
3049
3050
3051/*
3052 * ROR
3053 */
3054#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
3055IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3056{ \
3057 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3058 if (cShift) \
3059 { \
3060 if (a_cBitsWidth < 32) \
3061 cShift &= a_cBitsWidth - 1; \
3062 a_uType const uDst = *puDst; \
3063 a_uType const uResult = a_fnHlp(uDst, cShift); \
3064 *puDst = uResult; \
3065 \
3066 /* Calc EFLAGS: */ \
3067 AssertCompile(X86_EFL_CF_BIT == 0); \
3068 uint32_t fEfl = *pfEFlags; \
3069 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3070 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
3071 fEfl |= fCarry; \
3072 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3073 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
3074 else /* Intel 10980XE: According to the first sub-shift: */ \
3075 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
3076 *pfEFlags = fEfl; \
3077 } \
3078}
3079
3080#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3081EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
3082#endif
3083EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
3084EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
3085
3086#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3087EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
3088#endif
3089EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
3090EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
3091
3092DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
3093{
3094 return (uValue >> cShift) | (uValue << (16 - cShift));
3095}
3096#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3097EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
3098#endif
3099EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
3100EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
3101
3102DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
3103{
3104 return (uValue >> cShift) | (uValue << (8 - cShift));
3105}
3106#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3107EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
3108#endif
3109EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
3110EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
3111
3112
3113/*
3114 * RCL
3115 */
3116#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3117IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3118{ \
3119 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3120 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3121 cShift %= a_cBitsWidth + 1; \
3122 if (cShift) \
3123 { \
3124 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3125 cShift %= a_cBitsWidth + 1; \
3126 a_uType const uDst = *puDst; \
3127 a_uType uResult = uDst << cShift; \
3128 if (cShift > 1) \
3129 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
3130 \
3131 AssertCompile(X86_EFL_CF_BIT == 0); \
3132 uint32_t fEfl = *pfEFlags; \
3133 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3134 uResult |= (a_uType)fInCarry << (cShift - 1); \
3135 \
3136 *puDst = uResult; \
3137 \
3138 /* Calc EFLAGS. */ \
3139 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3140 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3141 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
3142 fEfl |= fOutCarry; \
3143 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3144 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3145 else /* Intel 10980XE: According to the first sub-shift: */ \
3146 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3147 *pfEFlags = fEfl; \
3148 } \
3149}
3150
3151#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3152EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3153#endif
3154EMIT_RCL(64, uint64_t, _intel, 1)
3155EMIT_RCL(64, uint64_t, _amd, 0)
3156
3157#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3158EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3159#endif
3160EMIT_RCL(32, uint32_t, _intel, 1)
3161EMIT_RCL(32, uint32_t, _amd, 0)
3162
3163#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3164EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3165#endif
3166EMIT_RCL(16, uint16_t, _intel, 1)
3167EMIT_RCL(16, uint16_t, _amd, 0)
3168
3169#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3170EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3171#endif
3172EMIT_RCL(8, uint8_t, _intel, 1)
3173EMIT_RCL(8, uint8_t, _amd, 0)
3174
3175
3176/*
3177 * RCR
3178 */
3179#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3180IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3181{ \
3182 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3183 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3184 cShift %= a_cBitsWidth + 1; \
3185 if (cShift) \
3186 { \
3187 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3188 cShift %= a_cBitsWidth + 1; \
3189 a_uType const uDst = *puDst; \
3190 a_uType uResult = uDst >> cShift; \
3191 if (cShift > 1) \
3192 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3193 \
3194 AssertCompile(X86_EFL_CF_BIT == 0); \
3195 uint32_t fEfl = *pfEFlags; \
3196 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3197 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3198 *puDst = uResult; \
3199 \
3200 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3201 it the same way as for 1 bit shifts. */ \
3202 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3203 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3204 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3205 fEfl |= fOutCarry; \
3206 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3207 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3208 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3209 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3210 *pfEFlags = fEfl; \
3211 } \
3212}
3213
3214#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3215EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3216#endif
3217EMIT_RCR(64, uint64_t, _intel, 1)
3218EMIT_RCR(64, uint64_t, _amd, 0)
3219
3220#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3221EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3222#endif
3223EMIT_RCR(32, uint32_t, _intel, 1)
3224EMIT_RCR(32, uint32_t, _amd, 0)
3225
3226#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3227EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3228#endif
3229EMIT_RCR(16, uint16_t, _intel, 1)
3230EMIT_RCR(16, uint16_t, _amd, 0)
3231
3232#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3233EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3234#endif
3235EMIT_RCR(8, uint8_t, _intel, 1)
3236EMIT_RCR(8, uint8_t, _amd, 0)
3237
3238
3239/*
3240 * SHL
3241 */
3242#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3243IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3244{ \
3245 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3246 if (cShift) \
3247 { \
3248 a_uType const uDst = *puDst; \
3249 a_uType uResult = uDst << cShift; \
3250 *puDst = uResult; \
3251 \
3252 /* Calc EFLAGS. */ \
3253 AssertCompile(X86_EFL_CF_BIT == 0); \
3254 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3255 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3256 fEfl |= fCarry; \
3257 if (!a_fIntelFlags) \
3258 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3259 else \
3260 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3261 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3262 fEfl |= X86_EFL_CALC_ZF(uResult); \
3263 fEfl |= g_afParity[uResult & 0xff]; \
3264 if (!a_fIntelFlags) \
3265 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3266 *pfEFlags = fEfl; \
3267 } \
3268}
3269
3270#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3271EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3272#endif
3273EMIT_SHL(64, uint64_t, _intel, 1)
3274EMIT_SHL(64, uint64_t, _amd, 0)
3275
3276#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3277EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3278#endif
3279EMIT_SHL(32, uint32_t, _intel, 1)
3280EMIT_SHL(32, uint32_t, _amd, 0)
3281
3282#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3283EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3284#endif
3285EMIT_SHL(16, uint16_t, _intel, 1)
3286EMIT_SHL(16, uint16_t, _amd, 0)
3287
3288#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3289EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3290#endif
3291EMIT_SHL(8, uint8_t, _intel, 1)
3292EMIT_SHL(8, uint8_t, _amd, 0)
3293
3294
3295/*
3296 * SHR
3297 */
3298#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3299IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3300{ \
3301 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3302 if (cShift) \
3303 { \
3304 a_uType const uDst = *puDst; \
3305 a_uType uResult = uDst >> cShift; \
3306 *puDst = uResult; \
3307 \
3308 /* Calc EFLAGS. */ \
3309 AssertCompile(X86_EFL_CF_BIT == 0); \
3310 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3311 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3312 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3313 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3314 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3315 fEfl |= X86_EFL_CALC_ZF(uResult); \
3316 fEfl |= g_afParity[uResult & 0xff]; \
3317 if (!a_fIntelFlags) \
3318 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3319 *pfEFlags = fEfl; \
3320 } \
3321}
3322
3323#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3324EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3325#endif
3326EMIT_SHR(64, uint64_t, _intel, 1)
3327EMIT_SHR(64, uint64_t, _amd, 0)
3328
3329#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3330EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3331#endif
3332EMIT_SHR(32, uint32_t, _intel, 1)
3333EMIT_SHR(32, uint32_t, _amd, 0)
3334
3335#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3336EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3337#endif
3338EMIT_SHR(16, uint16_t, _intel, 1)
3339EMIT_SHR(16, uint16_t, _amd, 0)
3340
3341#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3342EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3343#endif
3344EMIT_SHR(8, uint8_t, _intel, 1)
3345EMIT_SHR(8, uint8_t, _amd, 0)
3346
3347
3348/*
3349 * SAR
3350 */
3351#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3352IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3353{ \
3354 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3355 if (cShift) \
3356 { \
3357 a_iType const iDst = (a_iType)*puDst; \
3358 a_uType uResult = iDst >> cShift; \
3359 *puDst = uResult; \
3360 \
3361 /* Calc EFLAGS. \
3362 Note! The OF flag is always zero because the result never differs from the input. */ \
3363 AssertCompile(X86_EFL_CF_BIT == 0); \
3364 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3365 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3366 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3367 fEfl |= X86_EFL_CALC_ZF(uResult); \
3368 fEfl |= g_afParity[uResult & 0xff]; \
3369 if (!a_fIntelFlags) \
3370 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3371 *pfEFlags = fEfl; \
3372 } \
3373}
3374
3375#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3376EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3377#endif
3378EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3379EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3380
3381#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3382EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3383#endif
3384EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3385EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3386
3387#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3388EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3389#endif
3390EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3391EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3392
3393#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3394EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3395#endif
3396EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3397EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3398
3399
3400/*
3401 * SHLD
3402 *
3403 * - CF is the last bit shifted out of puDst.
3404 * - AF is always cleared by Intel 10980XE.
3405 * - AF is always set by AMD 3990X.
3406 * - OF is set according to the first shift on Intel 10980XE, it seems.
3407 * - OF is set according to the last sub-shift on AMD 3990X.
3408 * - ZF, SF and PF are calculated according to the result by both vendors.
3409 *
3410 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3411 * pick either the source register or the destination register for input bits
3412 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3413 * intel has changed behaviour here several times. We implement what current
3414 * skylake based does for now, we can extend this later as needed.
3415 */
3416#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3417IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3418 uint32_t *pfEFlags)) \
3419{ \
3420 cShift &= a_cBitsWidth - 1; \
3421 if (cShift) \
3422 { \
3423 a_uType const uDst = *puDst; \
3424 a_uType uResult = uDst << cShift; \
3425 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3426 *puDst = uResult; \
3427 \
3428 /* CALC EFLAGS: */ \
3429 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3430 if (a_fIntelFlags) \
3431 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3432 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3433 else \
3434 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3435 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3436 fEfl |= X86_EFL_AF; \
3437 } \
3438 AssertCompile(X86_EFL_CF_BIT == 0); \
3439 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3440 fEfl |= g_afParity[uResult & 0xff]; \
3441 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3442 fEfl |= X86_EFL_CALC_ZF(uResult); \
3443 *pfEFlags = fEfl; \
3444 } \
3445}
3446
3447#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3448EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3449#endif
3450EMIT_SHLD(64, uint64_t, _intel, 1)
3451EMIT_SHLD(64, uint64_t, _amd, 0)
3452
3453#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3454EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3455#endif
3456EMIT_SHLD(32, uint32_t, _intel, 1)
3457EMIT_SHLD(32, uint32_t, _amd, 0)
3458
3459#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3460IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3461{ \
3462 cShift &= 31; \
3463 if (cShift) \
3464 { \
3465 uint16_t const uDst = *puDst; \
3466 uint64_t const uTmp = a_fIntelFlags \
3467 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3468 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3469 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3470 *puDst = uResult; \
3471 \
3472 /* CALC EFLAGS: */ \
3473 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3474 AssertCompile(X86_EFL_CF_BIT == 0); \
3475 if (a_fIntelFlags) \
3476 { \
3477 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3478 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3479 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3480 } \
3481 else \
3482 { \
3483 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3484 if (cShift < 16) \
3485 { \
3486 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3487 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3488 } \
3489 else \
3490 { \
3491 if (cShift == 16) \
3492 fEfl |= uDst & X86_EFL_CF; \
3493 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3494 } \
3495 fEfl |= X86_EFL_AF; \
3496 } \
3497 fEfl |= g_afParity[uResult & 0xff]; \
3498 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3499 fEfl |= X86_EFL_CALC_ZF(uResult); \
3500 *pfEFlags = fEfl; \
3501 } \
3502}
3503
3504#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3505EMIT_SHLD_16(RT_NOTHING, 1)
3506#endif
3507EMIT_SHLD_16(_intel, 1)
3508EMIT_SHLD_16(_amd, 0)
3509
3510
3511/*
3512 * SHRD
3513 *
3514 * EFLAGS behaviour seems to be the same as with SHLD:
3515 * - CF is the last bit shifted out of puDst.
3516 * - AF is always cleared by Intel 10980XE.
3517 * - AF is always set by AMD 3990X.
3518 * - OF is set according to the first shift on Intel 10980XE, it seems.
3519 * - OF is set according to the last sub-shift on AMD 3990X.
3520 * - ZF, SF and PF are calculated according to the result by both vendors.
3521 *
3522 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3523 * pick either the source register or the destination register for input bits
3524 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3525 * intel has changed behaviour here several times. We implement what current
3526 * skylake based does for now, we can extend this later as needed.
3527 */
3528#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3529IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3530{ \
3531 cShift &= a_cBitsWidth - 1; \
3532 if (cShift) \
3533 { \
3534 a_uType const uDst = *puDst; \
3535 a_uType uResult = uDst >> cShift; \
3536 uResult |= uSrc << (a_cBitsWidth - cShift); \
3537 *puDst = uResult; \
3538 \
3539 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3540 AssertCompile(X86_EFL_CF_BIT == 0); \
3541 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3542 if (a_fIntelFlags) \
3543 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3544 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3545 else \
3546 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3547 if (cShift > 1) /* Set according to last shift. */ \
3548 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3549 else \
3550 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3551 fEfl |= X86_EFL_AF; \
3552 } \
3553 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3554 fEfl |= X86_EFL_CALC_ZF(uResult); \
3555 fEfl |= g_afParity[uResult & 0xff]; \
3556 *pfEFlags = fEfl; \
3557 } \
3558}
3559
3560#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3561EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3562#endif
3563EMIT_SHRD(64, uint64_t, _intel, 1)
3564EMIT_SHRD(64, uint64_t, _amd, 0)
3565
3566#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3567EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3568#endif
3569EMIT_SHRD(32, uint32_t, _intel, 1)
3570EMIT_SHRD(32, uint32_t, _amd, 0)
3571
3572#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3573IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3574{ \
3575 cShift &= 31; \
3576 if (cShift) \
3577 { \
3578 uint16_t const uDst = *puDst; \
3579 uint64_t const uTmp = a_fIntelFlags \
3580 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3581 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3582 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3583 *puDst = uResult; \
3584 \
3585 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3586 AssertCompile(X86_EFL_CF_BIT == 0); \
3587 if (a_fIntelFlags) \
3588 { \
3589 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3590 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3591 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3592 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3593 } \
3594 else \
3595 { \
3596 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3597 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3598 /* AMD 3990X: Set according to last shift. AF always set. */ \
3599 if (cShift > 1) /* Set according to last shift. */ \
3600 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3601 else \
3602 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3603 fEfl |= X86_EFL_AF; \
3604 } \
3605 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3606 fEfl |= X86_EFL_CALC_ZF(uResult); \
3607 fEfl |= g_afParity[uResult & 0xff]; \
3608 *pfEFlags = fEfl; \
3609 } \
3610}
3611
3612#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3613EMIT_SHRD_16(RT_NOTHING, 1)
3614#endif
3615EMIT_SHRD_16(_intel, 1)
3616EMIT_SHRD_16(_amd, 0)
3617
3618
3619/*
3620 * RORX (BMI2)
3621 */
3622#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3623IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3624{ \
3625 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3626}
3627
3628#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3629EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3630#endif
3631#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3632EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3633#endif
3634
3635
3636/*
3637 * SHLX (BMI2)
3638 */
3639#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3640IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3641{ \
3642 cShift &= a_cBitsWidth - 1; \
3643 *puDst = uSrc << cShift; \
3644}
3645
3646#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3647EMIT_SHLX(64, uint64_t, RT_NOTHING)
3648EMIT_SHLX(64, uint64_t, _fallback)
3649#endif
3650#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3651EMIT_SHLX(32, uint32_t, RT_NOTHING)
3652EMIT_SHLX(32, uint32_t, _fallback)
3653#endif
3654
3655
3656/*
3657 * SHRX (BMI2)
3658 */
3659#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3660IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3661{ \
3662 cShift &= a_cBitsWidth - 1; \
3663 *puDst = uSrc >> cShift; \
3664}
3665
3666#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3667EMIT_SHRX(64, uint64_t, RT_NOTHING)
3668EMIT_SHRX(64, uint64_t, _fallback)
3669#endif
3670#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3671EMIT_SHRX(32, uint32_t, RT_NOTHING)
3672EMIT_SHRX(32, uint32_t, _fallback)
3673#endif
3674
3675
3676/*
3677 * SARX (BMI2)
3678 */
3679#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3680IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3681{ \
3682 cShift &= a_cBitsWidth - 1; \
3683 *puDst = (a_iType)uSrc >> cShift; \
3684}
3685
3686#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3687EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3688EMIT_SARX(64, uint64_t, int64_t, _fallback)
3689#endif
3690#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3691EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3692EMIT_SARX(32, uint32_t, int32_t, _fallback)
3693#endif
3694
3695
3696/*
3697 * PDEP (BMI2)
3698 */
3699#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3700IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3701{ \
3702 a_uType uResult = 0; \
3703 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3704 if (fMask & ((a_uType)1 << iMaskBit)) \
3705 { \
3706 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3707 iBit++; \
3708 } \
3709 *puDst = uResult; \
3710}
3711
3712#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3713EMIT_PDEP(64, uint64_t, RT_NOTHING)
3714#endif
3715EMIT_PDEP(64, uint64_t, _fallback)
3716#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3717EMIT_PDEP(32, uint32_t, RT_NOTHING)
3718#endif
3719EMIT_PDEP(32, uint32_t, _fallback)
3720
3721/*
3722 * PEXT (BMI2)
3723 */
3724#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3725IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3726{ \
3727 a_uType uResult = 0; \
3728 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3729 if (fMask & ((a_uType)1 << iMaskBit)) \
3730 { \
3731 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3732 iBit++; \
3733 } \
3734 *puDst = uResult; \
3735}
3736
3737#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3738EMIT_PEXT(64, uint64_t, RT_NOTHING)
3739#endif
3740EMIT_PEXT(64, uint64_t, _fallback)
3741#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3742EMIT_PEXT(32, uint32_t, RT_NOTHING)
3743#endif
3744EMIT_PEXT(32, uint32_t, _fallback)
3745
3746
3747#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3748
3749# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3750/*
3751 * BSWAP
3752 */
3753
3754IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3755{
3756 *puDst = ASMByteSwapU64(*puDst);
3757}
3758
3759
3760IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3761{
3762 *puDst = ASMByteSwapU32(*puDst);
3763}
3764
3765
3766/* Note! undocument, so 32-bit arg */
3767IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3768{
3769#if 0
3770 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3771#else
3772 /* This is the behaviour AMD 3990x (64-bit mode): */
3773 *(uint16_t *)puDst = 0;
3774#endif
3775}
3776
3777# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3778
3779
3780
3781# if defined(IEM_WITHOUT_ASSEMBLY)
3782
3783/*
3784 * LFENCE, SFENCE & MFENCE.
3785 */
3786
/** LFENCE worker for builds without assembly: simply calls ASMReadFence(). */
3787IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3788{
3789 ASMReadFence();
3790}
3791
3792
/** SFENCE worker for builds without assembly: simply calls ASMWriteFence(). */
3793IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3794{
3795 ASMWriteFence();
3796}
3797
3798
/** MFENCE worker for builds without assembly: simply calls ASMMemoryFence(). */
3799IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3800{
3801 ASMMemoryFence();
3802}
3803
3804
3805# ifndef RT_ARCH_ARM64
/**
 * Alternative memory fence worker; like iemAImpl_mfence it just calls
 * ASMMemoryFence().  Not compiled when RT_ARCH_ARM64 is defined.
 */
3806IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3807{
3808 ASMMemoryFence();
3809}
3810# endif
3811
3812# endif
3813
3814#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3815
3816
3817IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3818{
3819 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3820 {
3821 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3822 *pu16Dst |= u16Src & X86_SEL_RPL;
3823
3824 *pfEFlags |= X86_EFL_ZF;
3825 }
3826 else
3827 *pfEFlags &= ~X86_EFL_ZF;
3828}
3829
3830
3831#if defined(IEM_WITHOUT_ASSEMBLY)
3832
3833/*********************************************************************************************************************************
3834* x87 FPU Loads *
3835*********************************************************************************************************************************/
3836
3837IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3838{
3839 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3840 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3841 {
3842 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3843 pFpuRes->r80Result.sj64.fInteger = 1;
3844 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3845 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3846 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3847 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3848 }
3849 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3850 {
3851 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3852 pFpuRes->r80Result.s.uExponent = 0;
3853 pFpuRes->r80Result.s.uMantissa = 0;
3854 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3855 }
3856 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3857 {
3858 /* Subnormal values gets normalized. */
3859 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3860 pFpuRes->r80Result.sj64.fInteger = 1;
3861 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3862 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3863 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3864 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3865 pFpuRes->FSW |= X86_FSW_DE;
3866 if (!(pFpuState->FCW & X86_FCW_DM))
3867 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3868 }
3869 else if (RTFLOAT32U_IS_INF(pr32Val))
3870 {
3871 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3872 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3873 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3874 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3875 }
3876 else
3877 {
3878 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3879 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3880 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3881 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3882 pFpuRes->r80Result.sj64.fInteger = 1;
3883 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3884 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3885 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3886 {
3887 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3888 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3889 pFpuRes->FSW |= X86_FSW_IE;
3890
3891 if (!(pFpuState->FCW & X86_FCW_IM))
3892 {
3893 /* The value is not pushed. */
3894 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3895 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3896 pFpuRes->r80Result.au64[0] = 0;
3897 pFpuRes->r80Result.au16[4] = 0;
3898 }
3899 }
3900 else
3901 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3902 }
3903}
3904
3905
3906IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3907{
3908 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3909 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3910 {
3911 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3912 pFpuRes->r80Result.sj64.fInteger = 1;
3913 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3914 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3915 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3916 }
3917 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3918 {
3919 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3920 pFpuRes->r80Result.s.uExponent = 0;
3921 pFpuRes->r80Result.s.uMantissa = 0;
3922 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3923 }
3924 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3925 {
3926 /* Subnormal values gets normalized. */
3927 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3928 pFpuRes->r80Result.sj64.fInteger = 1;
3929 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3930 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3931 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3932 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3933 pFpuRes->FSW |= X86_FSW_DE;
3934 if (!(pFpuState->FCW & X86_FCW_DM))
3935 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3936 }
3937 else if (RTFLOAT64U_IS_INF(pr64Val))
3938 {
3939 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3940 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3941 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3942 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3943 }
3944 else
3945 {
3946 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3947 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3948 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3949 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3950 pFpuRes->r80Result.sj64.fInteger = 1;
3951 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3952 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3953 {
3954 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3955 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3956 pFpuRes->FSW |= X86_FSW_IE;
3957
3958 if (!(pFpuState->FCW & X86_FCW_IM))
3959 {
3960 /* The value is not pushed. */
3961 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3962 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3963 pFpuRes->r80Result.au64[0] = 0;
3964 pFpuRes->r80Result.au16[4] = 0;
3965 }
3966 }
3967 else
3968 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3969 }
3970}
3971
3972
3973IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3974{
3975 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3976 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3977 /* Raises no exceptions. */
3978 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3979}
3980
3981
3982IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3983{
3984 pFpuRes->r80Result.sj64.fSign = 0;
3985 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3986 pFpuRes->r80Result.sj64.fInteger = 1;
3987 pFpuRes->r80Result.sj64.uFraction = 0;
3988
3989 /*
3990 * FPU status word:
3991 * - TOP is irrelevant, but we must match x86 assembly version.
3992 * - C1 is always cleared as we don't have any stack overflows.
3993 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3994 */
3995 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3996}
3997
3998
3999IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4000{
4001 pFpuRes->r80Result.sj64.fSign = 0;
4002 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4003 pFpuRes->r80Result.sj64.fInteger = 1;
4004 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4005 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4006 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
4007 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4008}
4009
4010
4011IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4012{
4013 pFpuRes->r80Result.sj64.fSign = 0;
4014 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4015 pFpuRes->r80Result.sj64.fInteger = 1;
4016 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
4017 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
4018 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4019}
4020
4021
4022IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4023{
4024 pFpuRes->r80Result.sj64.fSign = 0;
4025 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
4026 pFpuRes->r80Result.sj64.fInteger = 1;
4027 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4028 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4029 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
4030 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4031}
4032
4033
4034IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4035{
4036 pFpuRes->r80Result.sj64.fSign = 0;
4037 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
4038 pFpuRes->r80Result.sj64.fInteger = 1;
4039 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4040 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4041 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
4042 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4043}
4044
4045
4046IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4047{
4048 pFpuRes->r80Result.sj64.fSign = 0;
4049 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4050 pFpuRes->r80Result.sj64.fInteger = 1;
4051 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4052 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4053 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
4054 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4055}
4056
4057
4058IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4059{
4060 pFpuRes->r80Result.s.fSign = 0;
4061 pFpuRes->r80Result.s.uExponent = 0;
4062 pFpuRes->r80Result.s.uMantissa = 0;
4063 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4064}
4065
4066#define EMIT_FILD(a_cBits) \
4067IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
4068 int ## a_cBits ## _t const *piVal)) \
4069{ \
4070 int ## a_cBits ## _t iVal = *piVal; \
4071 if (iVal == 0) \
4072 { \
4073 pFpuRes->r80Result.s.fSign = 0; \
4074 pFpuRes->r80Result.s.uExponent = 0; \
4075 pFpuRes->r80Result.s.uMantissa = 0; \
4076 } \
4077 else \
4078 { \
4079 if (iVal > 0) \
4080 pFpuRes->r80Result.s.fSign = 0; \
4081 else \
4082 { \
4083 pFpuRes->r80Result.s.fSign = 1; \
4084 iVal = -iVal; \
4085 } \
4086 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
4087 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
4088 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
4089 } \
4090 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
4091}
4092EMIT_FILD(16)
4093EMIT_FILD(32)
4094EMIT_FILD(64)
4095
4096
4097IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
4098{
4099 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4100 if ( pd80Val->s.abPairs[0] == 0
4101 && pd80Val->s.abPairs[1] == 0
4102 && pd80Val->s.abPairs[2] == 0
4103 && pd80Val->s.abPairs[3] == 0
4104 && pd80Val->s.abPairs[4] == 0
4105 && pd80Val->s.abPairs[5] == 0
4106 && pd80Val->s.abPairs[6] == 0
4107 && pd80Val->s.abPairs[7] == 0
4108 && pd80Val->s.abPairs[8] == 0)
4109 {
4110 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4111 pFpuRes->r80Result.s.uExponent = 0;
4112 pFpuRes->r80Result.s.uMantissa = 0;
4113 }
4114 else
4115 {
4116 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4117
4118 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
4119 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
4120 cPairs--;
4121
4122 uint64_t uVal = 0;
4123 uint64_t uFactor = 1;
4124 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
4125 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
4126 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
4127
4128 unsigned const cBits = ASMBitLastSetU64(uVal);
4129 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
4130 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
4131 }
4132}
4133
4134
4135/*********************************************************************************************************************************
4136* x87 FPU Stores *
4137*********************************************************************************************************************************/
4138
4139/**
4140 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4141 *
4142 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4143 *
4144 * @returns Updated FPU status word value.
4145 * @param fSignIn Incoming sign indicator.
4146 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4147 * @param iExponentIn Unbiased exponent.
4148 * @param fFcw The FPU control word.
4149 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4150 * @param pr32Dst Where to return the output value, if one should be
4151 * returned.
4152 *
4153 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4154 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4155 */
4156static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4157 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4158{
4159 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4160 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4161 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4162 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4163 ? fRoundingOffMask
4164 : 0;
4165 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4166
4167 /*
4168 * Deal with potential overflows/underflows first, optimizing for none.
4169 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4170 */
4171 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4172 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4173 { /* likely? */ }
4174 /*
4175 * Underflow if the exponent zero or negative. This is attempted mapped
4176 * to a subnormal number when possible, with some additional trickery ofc.
4177 */
4178 else if (iExponentOut <= 0)
4179 {
4180 bool const fIsTiny = iExponentOut < 0
4181 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4182 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4183 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4184 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4185
4186 if (iExponentOut <= 0)
4187 {
4188 uMantissaIn = iExponentOut <= -63
4189 ? uMantissaIn != 0
4190 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4191 fRoundedOff = uMantissaIn & fRoundingOffMask;
4192 if (fRoundedOff && fIsTiny)
4193 fFsw |= X86_FSW_UE;
4194 iExponentOut = 0;
4195 }
4196 }
4197 /*
4198 * Overflow if at or above max exponent value or if we will reach max
4199 * when rounding. Will return +/-zero or +/-max value depending on
4200 * whether we're rounding or not.
4201 */
4202 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4203 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4204 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4205 {
4206 fFsw |= X86_FSW_OE;
4207 if (!(fFcw & X86_FCW_OM))
4208 return fFsw | X86_FSW_ES | X86_FSW_B;
4209 fFsw |= X86_FSW_PE;
4210 if (uRoundingAdd)
4211 fFsw |= X86_FSW_C1;
4212 if (!(fFcw & X86_FCW_PM))
4213 fFsw |= X86_FSW_ES | X86_FSW_B;
4214
4215 pr32Dst->s.fSign = fSignIn;
4216 if (uRoundingAdd)
4217 { /* Zero */
4218 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4219 pr32Dst->s.uFraction = 0;
4220 }
4221 else
4222 { /* Max */
4223 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4224 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4225 }
4226 return fFsw;
4227 }
4228
4229 /*
4230 * Normal or subnormal number.
4231 */
4232 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4233 uint64_t uMantissaOut = uMantissaIn;
4234 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4235 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4236 || fRoundedOff != uRoundingAdd)
4237 {
4238 uMantissaOut = uMantissaIn + uRoundingAdd;
4239 if (uMantissaOut >= uMantissaIn)
4240 { /* likely */ }
4241 else
4242 {
4243 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4244 iExponentOut++;
4245 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4246 fFsw |= X86_FSW_C1;
4247 }
4248 }
4249 else
4250 uMantissaOut = uMantissaIn;
4251
4252 /* Truncate the mantissa and set the return value. */
4253 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4254
4255 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4256 pr32Dst->s.uExponent = iExponentOut;
4257 pr32Dst->s.fSign = fSignIn;
4258
4259 /* Set status flags realted to rounding. */
4260 if (fRoundedOff)
4261 {
4262 fFsw |= X86_FSW_PE;
4263 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4264 fFsw |= X86_FSW_C1;
4265 if (!(fFcw & X86_FCW_PM))
4266 fFsw |= X86_FSW_ES | X86_FSW_B;
4267 }
4268
4269 return fFsw;
4270}
4271
4272
4273/**
4274 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4275 */
4276IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4277 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4278{
4279 uint16_t const fFcw = pFpuState->FCW;
4280 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4281 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4282 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4283 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4284 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4285 {
4286 pr32Dst->s.fSign = pr80Src->s.fSign;
4287 pr32Dst->s.uExponent = 0;
4288 pr32Dst->s.uFraction = 0;
4289 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4290 }
4291 else if (RTFLOAT80U_IS_INF(pr80Src))
4292 {
4293 pr32Dst->s.fSign = pr80Src->s.fSign;
4294 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4295 pr32Dst->s.uFraction = 0;
4296 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4297 }
4298 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4299 {
4300 /* Mapped to +/-QNaN */
4301 pr32Dst->s.fSign = pr80Src->s.fSign;
4302 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4303 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4304 }
4305 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4306 {
4307 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4308 if (fFcw & X86_FCW_IM)
4309 {
4310 pr32Dst->s.fSign = 1;
4311 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4312 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4313 fFsw |= X86_FSW_IE;
4314 }
4315 else
4316 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4317 }
4318 else if (RTFLOAT80U_IS_NAN(pr80Src))
4319 {
4320 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4321 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4322 {
4323 pr32Dst->s.fSign = pr80Src->s.fSign;
4324 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4325 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4326 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4327 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4328 fFsw |= X86_FSW_IE;
4329 }
4330 else
4331 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4332 }
4333 else
4334 {
4335 /* Denormal values causes both an underflow and precision exception. */
4336 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4337 if (fFcw & X86_FCW_UM)
4338 {
4339 pr32Dst->s.fSign = pr80Src->s.fSign;
4340 pr32Dst->s.uExponent = 0;
4341 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4342 {
4343 pr32Dst->s.uFraction = 1;
4344 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4345 if (!(fFcw & X86_FCW_PM))
4346 fFsw |= X86_FSW_ES | X86_FSW_B;
4347 }
4348 else
4349 {
4350 pr32Dst->s.uFraction = 0;
4351 fFsw |= X86_FSW_UE | X86_FSW_PE;
4352 if (!(fFcw & X86_FCW_PM))
4353 fFsw |= X86_FSW_ES | X86_FSW_B;
4354 }
4355 }
4356 else
4357 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4358 }
4359 *pu16FSW = fFsw;
4360}
4361
4362
4363/**
4364 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4365 *
4366 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4367 *
4368 * @returns Updated FPU status word value.
4369 * @param fSignIn Incoming sign indicator.
4370 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4371 * @param iExponentIn Unbiased exponent.
4372 * @param fFcw The FPU control word.
4373 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4374 * @param pr64Dst Where to return the output value, if one should be
4375 * returned.
4376 *
4377 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4378 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4379 */
4380static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4381 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4382{
4383 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4384 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4385 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4386 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4387 ? fRoundingOffMask
4388 : 0;
4389 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4390
4391 /*
4392 * Deal with potential overflows/underflows first, optimizing for none.
4393 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4394 */
4395 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4396 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4397 { /* likely? */ }
4398 /*
4399 * Underflow if the exponent zero or negative. This is attempted mapped
4400 * to a subnormal number when possible, with some additional trickery ofc.
4401 */
4402 else if (iExponentOut <= 0)
4403 {
4404 bool const fIsTiny = iExponentOut < 0
4405 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4406 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4407 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4408 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4409
4410 if (iExponentOut <= 0)
4411 {
4412 uMantissaIn = iExponentOut <= -63
4413 ? uMantissaIn != 0
4414 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4415 fRoundedOff = uMantissaIn & fRoundingOffMask;
4416 if (fRoundedOff && fIsTiny)
4417 fFsw |= X86_FSW_UE;
4418 iExponentOut = 0;
4419 }
4420 }
4421 /*
4422 * Overflow if at or above max exponent value or if we will reach max
4423 * when rounding. Will return +/-zero or +/-max value depending on
4424 * whether we're rounding or not.
4425 */
4426 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4427 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4428 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4429 {
4430 fFsw |= X86_FSW_OE;
4431 if (!(fFcw & X86_FCW_OM))
4432 return fFsw | X86_FSW_ES | X86_FSW_B;
4433 fFsw |= X86_FSW_PE;
4434 if (uRoundingAdd)
4435 fFsw |= X86_FSW_C1;
4436 if (!(fFcw & X86_FCW_PM))
4437 fFsw |= X86_FSW_ES | X86_FSW_B;
4438
4439 pr64Dst->s64.fSign = fSignIn;
4440 if (uRoundingAdd)
4441 { /* Zero */
4442 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4443 pr64Dst->s64.uFraction = 0;
4444 }
4445 else
4446 { /* Max */
4447 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4448 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4449 }
4450 return fFsw;
4451 }
4452
4453 /*
4454 * Normal or subnormal number.
4455 */
4456 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4457 uint64_t uMantissaOut = uMantissaIn;
4458 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4459 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4460 || fRoundedOff != uRoundingAdd)
4461 {
4462 uMantissaOut = uMantissaIn + uRoundingAdd;
4463 if (uMantissaOut >= uMantissaIn)
4464 { /* likely */ }
4465 else
4466 {
4467 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4468 iExponentOut++;
4469 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4470 fFsw |= X86_FSW_C1;
4471 }
4472 }
4473 else
4474 uMantissaOut = uMantissaIn;
4475
4476 /* Truncate the mantissa and set the return value. */
4477 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4478
4479 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4480 pr64Dst->s64.uExponent = iExponentOut;
4481 pr64Dst->s64.fSign = fSignIn;
4482
4483 /* Set status flags realted to rounding. */
4484 if (fRoundedOff)
4485 {
4486 fFsw |= X86_FSW_PE;
4487 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4488 fFsw |= X86_FSW_C1;
4489 if (!(fFcw & X86_FCW_PM))
4490 fFsw |= X86_FSW_ES | X86_FSW_B;
4491 }
4492
4493 return fFsw;
4494}
4495
4496
4497/**
4498 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4499 */
4500IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4501 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4502{
4503 uint16_t const fFcw = pFpuState->FCW;
4504 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4505 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4506 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4507 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4508 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4509 {
4510 pr64Dst->s64.fSign = pr80Src->s.fSign;
4511 pr64Dst->s64.uExponent = 0;
4512 pr64Dst->s64.uFraction = 0;
4513 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4514 }
4515 else if (RTFLOAT80U_IS_INF(pr80Src))
4516 {
4517 pr64Dst->s64.fSign = pr80Src->s.fSign;
4518 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4519 pr64Dst->s64.uFraction = 0;
4520 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4521 }
4522 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4523 {
4524 /* Mapped to +/-QNaN */
4525 pr64Dst->s64.fSign = pr80Src->s.fSign;
4526 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4527 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4528 }
4529 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4530 {
4531 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4532 if (fFcw & X86_FCW_IM)
4533 {
4534 pr64Dst->s64.fSign = 1;
4535 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4536 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4537 fFsw |= X86_FSW_IE;
4538 }
4539 else
4540 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4541 }
4542 else if (RTFLOAT80U_IS_NAN(pr80Src))
4543 {
4544 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4545 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4546 {
4547 pr64Dst->s64.fSign = pr80Src->s.fSign;
4548 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4549 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4550 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4551 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4552 fFsw |= X86_FSW_IE;
4553 }
4554 else
4555 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4556 }
4557 else
4558 {
4559 /* Denormal values causes both an underflow and precision exception. */
4560 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4561 if (fFcw & X86_FCW_UM)
4562 {
4563 pr64Dst->s64.fSign = pr80Src->s.fSign;
4564 pr64Dst->s64.uExponent = 0;
4565 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4566 {
4567 pr64Dst->s64.uFraction = 1;
4568 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4569 if (!(fFcw & X86_FCW_PM))
4570 fFsw |= X86_FSW_ES | X86_FSW_B;
4571 }
4572 else
4573 {
4574 pr64Dst->s64.uFraction = 0;
4575 fFsw |= X86_FSW_UE | X86_FSW_PE;
4576 if (!(fFcw & X86_FCW_PM))
4577 fFsw |= X86_FSW_ES | X86_FSW_B;
4578 }
4579 }
4580 else
4581 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4582 }
4583 *pu16FSW = fFsw;
4584}
4585
4586
4587IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4588 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4589{
4590 /*
4591 * FPU status word:
4592 * - TOP is irrelevant, but we must match x86 assembly version (0).
4593 * - C1 is always cleared as we don't have any stack overflows.
4594 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4595 */
4596 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4597 *pr80Dst = *pr80Src;
4598}
4599
4600
/*
 * Emits iemAImpl_fist_r80_to_i<a_cBits>: converts an 80-bit floating point
 * value to a signed integer, rounding according to FCW.RC.
 *
 * Macro parameters:
 *      a_cBits             Width of the destination integer in bits.
 *      a_iType             The destination integer type.
 *      a_iTypeMin          The smallest value of a_iType.
 *      a_iTypeIndefinite   The value stored when the invalid-operand
 *                          exception is raised while masked (FCW.IM set).
 *
 * The emitted function returns the new status word via pu16FSW, starting out
 * from C0, C2 and C3 of the current FSW.  When IE is raised unmasked, *piDst
 * is not written and TOP is set to 7 in the returned status word.
 *
 * Mantissa:
 *    63        56        48        40        32        24        16         8         0
 *    v          v         v         v         v         v         v         v         v
 *    1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
 *       \    \    \    \    \    \    \    \    \    \    \    \    \    \    \    \
 * Exp:   0    4    8    12   16   20   24   28   32   36   40   44   48   52   56   60
 *
 * int64_t has the same width, only bit 63 is the sign bit.  So, the max we can map over
 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62.  The number of bits we
 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
 * where we'll drop off all but bit 63.
 */
#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                           a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t uMantissa = pr80Val->s.uMantissa; \
        int32_t  iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        /* The magnitude fits the destination before rounding (exponent 0 thru a_cBits - 2). */ \
        if ((uint32_t)iExponent <= a_cBits - 2) \
        { \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
                                            ? RT_BIT_64(cShiftOff - 1) \
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
                                            ? fRoundingOffMask \
                                            : 0; \
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask; \
            \
            /* Drop the fraction bits, then add one if the dropped bits plus the rounding addend carry. */ \
            uMantissa >>= cShiftOff; \
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
            uMantissa += uRounding; \
            if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
            { \
                if (fRoundedOff) \
                { \
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
                    else if (uRounding) \
                        fFsw |= X86_FSW_C1; \
                    fFsw |= X86_FSW_PE; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                \
                if (!fSignIn) \
                    *piDst = (a_iType)uMantissa; \
                else \
                    *piDst = -(a_iType)uMantissa; \
            } \
            else \
            { \
                /* overflowed after rounding. */ \
                AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
                          ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
                          pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
                \
                /* Special case for the integer minimum value. */ \
                if (fSignIn) \
                { \
                    *piDst = a_iTypeMin; \
                    fFsw |= X86_FSW_PE | X86_FSW_C1; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                else \
                { \
                    fFsw |= X86_FSW_IE; \
                    if (fFcw & X86_FCW_IM) \
                        *piDst = a_iTypeMin; \
                    else \
                        fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
                } \
            } \
        } \
        /* \
         * Tiny sub-zero numbers. \
         */ \
        else if (iExponent < 0) \
        { \
            if (!fSignIn) \
            { \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                { \
                    *piDst = 1; \
                    fFsw |= X86_FSW_C1; \
                } \
                else \
                    *piDst = 0; \
            } \
            else \
            { \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                    *piDst = 0; \
                else \
                { \
                    *piDst = -1; \
                    fFsw |= X86_FSW_C1; \
                } \
            } \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (   a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63))) \
        { \
            *piDst = a_iTypeMin; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are either rounded \
     * to zero, 1 or -1 depending on sign and rounding control. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
            *piDst = 0; \
        else \
        { \
            *piDst = fSignIn ? -1 : 1; \
            fFsw |= X86_FSW_C1; \
        } \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
/* The three integer widths emitted here. */
EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4787
4788#endif /*IEM_WITHOUT_ASSEMBLY */
4789
4790
4791/*
4792 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4793 *
4794 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4795 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4796 * thus the @a a_cBitsIn.
4797 */
4798#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4799IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4800 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4801{ \
4802 uint16_t const fFcw = pFpuState->FCW; \
4803 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4804 bool const fSignIn = pr80Val->s.fSign; \
4805 \
4806 /* \
4807 * Deal with normal numbers first. \
4808 */ \
4809 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4810 { \
4811 uint64_t uMantissa = pr80Val->s.uMantissa; \
4812 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4813 \
4814 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4815 { \
4816 unsigned const cShiftOff = 63 - iExponent; \
4817 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4818 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4819 uMantissa >>= cShiftOff; \
4820 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4821 if (!fSignIn) \
4822 *piDst = (a_iType)uMantissa; \
4823 else \
4824 *piDst = -(a_iType)uMantissa; \
4825 \
4826 if (fRoundedOff) \
4827 { \
4828 fFsw |= X86_FSW_PE; \
4829 if (!(fFcw & X86_FCW_PM)) \
4830 fFsw |= X86_FSW_ES | X86_FSW_B; \
4831 } \
4832 } \
4833 /* \
4834 * Tiny sub-zero numbers. \
4835 */ \
4836 else if (iExponent < 0) \
4837 { \
4838 *piDst = 0; \
4839 fFsw |= X86_FSW_PE; \
4840 if (!(fFcw & X86_FCW_PM)) \
4841 fFsw |= X86_FSW_ES | X86_FSW_B; \
4842 } \
4843 /* \
4844 * Special MIN case. \
4845 */ \
4846 else if ( fSignIn && iExponent == a_cBits - 1 \
4847 && (a_cBits < 64 \
4848 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4849 : uMantissa == RT_BIT_64(63)) ) \
4850 { \
4851 *piDst = a_iTypeMin; \
4852 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4853 { \
4854 fFsw |= X86_FSW_PE; \
4855 if (!(fFcw & X86_FCW_PM)) \
4856 fFsw |= X86_FSW_ES | X86_FSW_B; \
4857 } \
4858 } \
4859 /* \
4860 * Figure this weirdness. \
4861 */ \
4862 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4863 { \
4864 *piDst = 0; \
4865 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4866 { \
4867 fFsw |= X86_FSW_PE; \
4868 if (!(fFcw & X86_FCW_PM)) \
4869 fFsw |= X86_FSW_ES | X86_FSW_B; \
4870 } \
4871 } \
4872 /* \
4873 * Too large/small number outside the target integer range. \
4874 */ \
4875 else \
4876 { \
4877 fFsw |= X86_FSW_IE; \
4878 if (fFcw & X86_FCW_IM) \
4879 *piDst = a_iTypeIndefinite; \
4880 else \
4881 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4882 } \
4883 } \
4884 /* \
4885 * Map both +0 and -0 to integer zero (signless/+). \
4886 */ \
4887 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4888 *piDst = 0; \
4889 /* \
4890 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4891 */ \
4892 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4893 { \
4894 *piDst = 0; \
4895 fFsw |= X86_FSW_PE; \
4896 if (!(fFcw & X86_FCW_PM)) \
4897 fFsw |= X86_FSW_ES | X86_FSW_B; \
4898 } \
4899 /* \
4900 * All other special values are considered invalid arguments and result \
4901 * in an IE exception and indefinite value if masked. \
4902 */ \
4903 else \
4904 { \
4905 fFsw |= X86_FSW_IE; \
4906 if (fFcw & X86_FCW_IM) \
4907 *piDst = a_iTypeIndefinite; \
4908 else \
4909 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4910 } \
4911 *pu16FSW = fFsw; \
4912}
4913#if defined(IEM_WITHOUT_ASSEMBLY)
4914EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4915EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4916EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4917#endif
4918EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4919EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4920
4921
4922#if defined(IEM_WITHOUT_ASSEMBLY)
4923
/**
 * Converts an 80-bit floating point value to 80-bit packed BCD, rounding to
 * an integer according to FCW.RC.
 *
 * @param   pFpuState   The FPU state; only FCW and FSW are read.
 * @param   pu16FSW     Where to return the new status word, starting out from
 *                      C0, C2 and C3 of the current FSW.  When IE is raised
 *                      unmasked, TOP is set to 7 as well.
 * @param   pd80Dst     Where to store the packed BCD result.  Not written
 *                      when IE is raised unmasked.
 * @param   pr80Src     The value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
{
    /* Result tables indexed by the sign bit (0 = positive, 1 = negative). */
    /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
    static RTPBCD80U const s_ad80Zeros[2]  = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
    static RTPBCD80U const s_ad80One[2]    = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
                                               RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
    static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();

    uint16_t const fFcw    = pFpuState->FCW;
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    bool const     fSignIn = pr80Src->s.fSign;

    /*
     * Deal with normal numbers first.
     */
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
    {
        uint64_t uMantissa = pr80Src->s.uMantissa;
        int32_t  iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
        /* Pre-filter on magnitude: at exponent 59 the mantissa is shifted right
           by 4 below, and 0xde0b6b3a763fffff >> 4 is 999999999999999999, the
           largest 18 digit number.  Rounding may still push it over, which is
           caught by the RTPBCD80U_MAX check further down. */
        if (   (uint32_t)iExponent <= 58
            || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
        {
            unsigned const cShiftOff        = 63 - iExponent;
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                            ? RT_BIT_64(cShiftOff - 1)
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                            ? fRoundingOffMask
                                            : 0;
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask;

            /* Drop the fraction bits, then add one if the dropped bits plus the rounding addend carry. */
            uMantissa >>= cShiftOff;
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
            uMantissa += uRounding;
            if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
            {
                if (fRoundedOff)
                {
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
                    else if (uRounding)
                        fFsw |= X86_FSW_C1;
                    fFsw |= X86_FSW_PE;
                    if (!(fFcw & X86_FCW_PM))
                        fFsw |= X86_FSW_ES | X86_FSW_B;
                }

                /* Emit the decimal digits two at a time, least significant pair first. */
                pd80Dst->s.fSign = fSignIn;
                pd80Dst->s.uPad  = 0;
                for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
                {
                    unsigned const uDigits = uMantissa % 100;
                    uMantissa /= 100;
                    uint8_t const bLo = uDigits % 10;
                    uint8_t const bHi = uDigits / 10;
                    pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
                }
            }
            else
            {
                /* overflowed after rounding. */
                fFsw |= X86_FSW_IE;
                if (fFcw & X86_FCW_IM)
                    *pd80Dst = s_d80Indefinite;
                else
                    fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
            }
        }
        /*
         * Tiny sub-zero numbers.
         */
        else if (iExponent < 0)
        {
            if (!fSignIn)
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
                else
                    *pd80Dst = s_ad80Zeros[fSignIn];
            }
            else
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                    *pd80Dst = s_ad80Zeros[fSignIn];
                else
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
            }
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        /*
         * Too large/small number outside the target integer range.
         */
        else
        {
            fFsw |= X86_FSW_IE;
            if (fFcw & X86_FCW_IM)
                *pd80Dst = s_d80Indefinite;
            else
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    /*
     * Map +0 and -0 to the packed BCD zero table entry selected by the sign
     * (i.e. unlike the integer stores this does not fold both into one value).
     */
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
        *pd80Dst = s_ad80Zeros[fSignIn];
    /*
     * Denormals are just really tiny sub-zero numbers that are either rounded
     * to zero, 1 or -1 depending on sign and rounding control.
     */
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
    {
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
            *pd80Dst = s_ad80Zeros[fSignIn];
        else
        {
            *pd80Dst = s_ad80One[fSignIn];
            fFsw |= X86_FSW_C1;
        }
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }
    /*
     * All other special values are considered invalid arguments and result
     * in an IE exception and indefinite value if masked.
     */
    else
    {
        fFsw |= X86_FSW_IE;
        if (fFcw & X86_FCW_IM)
            *pd80Dst = s_d80Indefinite;
        else
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
    }
    *pu16FSW = fFsw;
}
5073
5074
5075/*********************************************************************************************************************************
5076* FPU Helpers *
5077*********************************************************************************************************************************/
/* Compile-time checks that the IPRT floating point unions have the expected sizes in bytes. */
AssertCompileSize(RTFLOAT128U, 16);
AssertCompileSize(RTFLOAT80U, 10);
AssertCompileSize(RTFLOAT64U, 8);
AssertCompileSize(RTFLOAT32U, 4);
5082
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro will declare a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
 * a normalization was performed.
 *
 * @note This must be applied before calling SoftFloat with a value that could be
 *       a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
 *       correctly.
 * @note Because it declares a variable, the macro can only be used where a
 *       declaration is allowed.  The trailing 'else do {} while (0)' makes the
 *       invocation require a terminating semicolon.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { \
        a_r80ValNormalized = *a_pr80Val; \
        a_r80ValNormalized.s.uExponent = 1; \
        a_pr80Val = &a_r80ValNormalized; \
    } else do {} while (0)
5106
5107#ifdef IEM_WITH_FLOAT128_FOR_FPU
5108
5109DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
5110{
5111 int fNew;
5112 switch (fFcw & X86_FCW_RC_MASK)
5113 {
5114 default:
5115 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
5116 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
5117 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
5118 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
5119 }
5120 int fOld = fegetround();
5121 fesetround(fNew);
5122 return fOld;
5123}
5124
5125
5126DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
5127{
5128 fesetround(fOld);
5129}
5130
5131DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
5132{
5133 RT_NOREF(fFcw);
5134 RTFLOAT128U Tmp;
5135 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
5136 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
5137 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
5138 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
5139 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
5140 {
5141 Assert(Tmp.s.uExponent == 0);
5142 Tmp.s2.uSignAndExponent++;
5143 }
5144 return *(_Float128 *)&Tmp;
5145}
5146
5147
5148DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5149{
5150 RT_NOREF(fFcw);
5151 RTFLOAT128U Tmp;
5152 *(_Float128 *)&Tmp = rd128ValSrc;
5153 ASMCompilerBarrier();
5154 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5155 {
5156 pr80Dst->s.fSign = Tmp.s64.fSign;
5157 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5158 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5159 | Tmp.s64.uFractionLo >> (64 - 15);
5160
5161 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5162 unsigned const cShiftOff = 64 - 15;
5163 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5164 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5165 if (uRoundedOff)
5166 {
5167 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5168 ? RT_BIT_64(cShiftOff - 1)
5169 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5170 ? fRoundingOffMask
5171 : 0;
5172 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5173 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5174 || uRoundedOff != uRoundingAdd)
5175 {
5176 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5177 {
5178 uFraction += 1;
5179 if (!(uFraction & RT_BIT_64(63)))
5180 { /* likely */ }
5181 else
5182 {
5183 uFraction >>= 1;
5184 pr80Dst->s.uExponent++;
5185 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5186 return fFsw;
5187 }
5188 fFsw |= X86_FSW_C1;
5189 }
5190 }
5191 fFsw |= X86_FSW_PE;
5192 if (!(fFcw & X86_FCW_PM))
5193 fFsw |= X86_FSW_ES | X86_FSW_B;
5194 }
5195 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5196 }
5197 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5198 {
5199 pr80Dst->s.fSign = Tmp.s64.fSign;
5200 pr80Dst->s.uExponent = 0;
5201 pr80Dst->s.uMantissa = 0;
5202 }
5203 else if (RTFLOAT128U_IS_INF(&Tmp))
5204 {
5205 pr80Dst->s.fSign = Tmp.s64.fSign;
5206 pr80Dst->s.uExponent = 0;
5207 pr80Dst->s.uMantissa = 0;
5208 }
5209 return fFsw;
5210}
5211
5212
5213#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5214
/**
 * Initializer for the SoftFloat state structure.
 *
 * This is a positional (brace) initializer supplying five values, in order:
 *      -# softfloat_tininess_afterRounding,
 *      -# the SoftFloat rounding mode matching the RC field of @a a_fFcw
 *         (nearest -> near_even, up -> max, down -> min, otherwise minMag),
 *      -# zero,
 *      -# the exception mask bits of @a a_fFcw (X86_FCW_XCPT_MASK),
 *      -# 64 for the PC_53 precision control setting, 32 for PC_24 and 80
 *         for anything else.
 *
 * NOTE(review): which softfloat_state_t member each position initializes is
 * defined by the structure declaration, which is not visible here - confirm
 * against softfloat.h when changing this.
 *
 * @note @a a_fFcw is evaluated several times.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
          ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN    ? (uint8_t)softfloat_round_min \
        :                                                      (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
          ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
    }
5228
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The result is @a a_fFsw with the following OR'ed in:
 *      - the softfloat_flag_c1 bit of the state's exceptionFlags shifted left
 *        by two (presumably to land on X86_FSW_C1 - confirm against the flag
 *        definition),
 *      - the exceptionFlags bits covered by X86_FSW_XCPT_MASK,
 *      - X86_FSW_ES and X86_FSW_B if any of those exception bits is not set
 *        (i.e. not masked) in @a a_fFcw.
 *
 * @note @a a_pSoftState and @a a_fFcw are evaluated more than once.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (  (a_fFsw) \
     | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
     | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
     | (  ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
        ? X86_FSW_ES | X86_FSW_B : 0) )
5236
5237
5238DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5239{
5240 RT_NOREF(fFcw);
5241 Assert(cBits > 64);
5242# if 0 /* rounding does not seem to help */
5243 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5244 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5245 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5246 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5247 {
5248 uint64_t uOld = r128.v[0];
5249 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5250 if (r128.v[0] < uOld)
5251 r128.v[1] += 1;
5252 }
5253# else
5254 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5255# endif
5256 return r128;
5257}
5258
5259
5260DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5261{
5262 RT_NOREF(fFcw);
5263 Assert(cBits > 64);
5264# if 0 /* rounding does not seem to help, not even on constants */
5265 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5266 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5267 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5268 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5269 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5270 {
5271 uint64_t uOld = r128.v[0];
5272 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5273 if (r128.v[0] < uOld)
5274 r128.v[1] += 1;
5275 }
5276 return r128;
5277# else
5278 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5279 return r128;
5280# endif
5281}
5282
5283
# if 0 /* unused */
/**
 * Repackages an IPRT 128-bit floating point value as a SoftFloat float128_t
 * by copying its two 64-bit words.  Currently compiled out.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
    return r128;
}
# endif
5291
5292
5293/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5294DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5295{
5296 extFloat80_t Tmp;
5297 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5298 Tmp.signif = pr80Val->s2.uMantissa;
5299 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5300 return extF80_to_f128(Tmp, &Ignored);
5301}
5302
5303
5304/**
5305 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5306 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5307 *
5308 * This is only a structure format conversion, nothing else.
5309 */
5310DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5311{
5312 extFloat80_t Tmp;
5313 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5314 Tmp.signif = pr80Val->s2.uMantissa;
5315 return Tmp;
5316}
5317
5318
5319/**
5320 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5321 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5322 *
5323 * This is only a structure format conversion, nothing else.
5324 */
5325DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5326{
5327 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5328 pr80Dst->s2.uMantissa = r80XSrc.signif;
5329 return pr80Dst;
5330}
5331
5332
5333DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5334{
5335 RT_NOREF(fFcw);
5336 RTFLOAT128U Tmp;
5337 *(float128_t *)&Tmp = r128Src;
5338 ASMCompilerBarrier();
5339
5340 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5341 {
5342 pr80Dst->s.fSign = Tmp.s64.fSign;
5343 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5344 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5345 | Tmp.s64.uFractionLo >> (64 - 15);
5346
5347 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5348 unsigned const cShiftOff = 64 - 15;
5349 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5350 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5351 if (uRoundedOff)
5352 {
5353 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5354 ? RT_BIT_64(cShiftOff - 1)
5355 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5356 ? fRoundingOffMask
5357 : 0;
5358 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5359 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5360 || uRoundedOff != uRoundingAdd)
5361 {
5362 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5363 {
5364 uFraction += 1;
5365 if (!(uFraction & RT_BIT_64(63)))
5366 { /* likely */ }
5367 else
5368 {
5369 uFraction >>= 1;
5370 pr80Dst->s.uExponent++;
5371 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5372 return fFsw;
5373 }
5374 fFsw |= X86_FSW_C1;
5375 }
5376 }
5377 fFsw |= X86_FSW_PE;
5378 if (!(fFcw & X86_FCW_PM))
5379 fFsw |= X86_FSW_ES | X86_FSW_B;
5380 }
5381
5382 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5383 }
5384 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5385 {
5386 pr80Dst->s.fSign = Tmp.s64.fSign;
5387 pr80Dst->s.uExponent = 0;
5388 pr80Dst->s.uMantissa = 0;
5389 }
5390 else if (RTFLOAT128U_IS_INF(&Tmp))
5391 {
5392 pr80Dst->s.fSign = Tmp.s64.fSign;
5393 pr80Dst->s.uExponent = 0x7fff;
5394 pr80Dst->s.uMantissa = 0;
5395 }
5396 return fFsw;
5397}
5398
5399
5400/**
5401 * Helper for transfering exception and C1 to FSW and setting the result value
5402 * accordingly.
5403 *
5404 * @returns Updated FSW.
5405 * @param pSoftState The SoftFloat state following the operation.
5406 * @param r80XResult The result of the SoftFloat operation.
5407 * @param pr80Result Where to store the result for IEM.
5408 * @param fFcw The FPU control word.
5409 * @param fFsw The FSW before the operation, with necessary bits
5410 * cleared and such.
5411 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5412 * raised.
5413 */
5414DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5415 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5416 PCRTFLOAT80U pr80XcptResult)
5417{
5418 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5419 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5420 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5421 fFsw |= X86_FSW_ES | X86_FSW_B;
5422
5423 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5424 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5425 else
5426 {
5427 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5428 *pr80Result = *pr80XcptResult;
5429 }
5430 return fFsw;
5431}
5432
5433
5434/**
5435 * Helper doing polynomial evaluation using Horner's method.
5436 *
5437 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5438 */
5439float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5440 unsigned cPrecision, softfloat_state_t *pSoftState)
5441{
5442 Assert(cHornerConsts > 1);
5443 size_t i = cHornerConsts - 1;
5444 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5445 while (i-- > 0)
5446 {
5447 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5448 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5449 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5450 }
5451 return r128Result;
5452}
5453
5454#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5455
5456
5457/**
5458 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5459 * mantissa, exponent and sign.
5460 *
5461 * @returns Updated FSW.
5462 * @param pr80Dst Where to return the composed value.
5463 * @param fSign The sign.
5464 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5465 * ignored and should be zero. This will probably be
5466 * modified during normalization and rounding.
5467 * @param iExponent Unbiased exponent.
5468 * @param fFcw The FPU control word.
5469 * @param fFsw The FPU status word.
5470 */
5471static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5472 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5473{
5474 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5475
5476 iExponent += RTFLOAT80U_EXP_BIAS;
5477
5478 /* Do normalization if necessary and possible. */
5479 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5480 {
5481 int cShift = 192 - RTUInt256BitCount(puMantissa);
5482 if (iExponent > cShift)
5483 iExponent -= cShift;
5484 else
5485 {
5486 if (fFcw & X86_FCW_UM)
5487 {
5488 if (iExponent > 0)
5489 cShift = --iExponent;
5490 else
5491 cShift = 0;
5492 }
5493 iExponent -= cShift;
5494 }
5495 RTUInt256AssignShiftLeft(puMantissa, cShift);
5496 }
5497
5498 /* Do rounding. */
5499 uint64_t uMantissa = puMantissa->QWords.qw2;
5500 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5501 {
5502 bool fAdd;
5503 switch (fFcw & X86_FCW_RC_MASK)
5504 {
5505 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5506 case X86_FCW_RC_NEAREST:
5507 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5508 {
5509 if ( (uMantissa & 1)
5510 || puMantissa->QWords.qw0 != 0
5511 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5512 {
5513 fAdd = true;
5514 break;
5515 }
5516 uMantissa &= ~(uint64_t)1;
5517 }
5518 fAdd = false;
5519 break;
5520 case X86_FCW_RC_ZERO:
5521 fAdd = false;
5522 break;
5523 case X86_FCW_RC_UP:
5524 fAdd = !fSign;
5525 break;
5526 case X86_FCW_RC_DOWN:
5527 fAdd = fSign;
5528 break;
5529 }
5530 if (fAdd)
5531 {
5532 uint64_t const uTmp = uMantissa;
5533 uMantissa = uTmp + 1;
5534 if (uMantissa < uTmp)
5535 {
5536 uMantissa >>= 1;
5537 uMantissa |= RT_BIT_64(63);
5538 iExponent++;
5539 }
5540 fFsw |= X86_FSW_C1;
5541 }
5542 fFsw |= X86_FSW_PE;
5543 if (!(fFcw & X86_FCW_PM))
5544 fFsw |= X86_FSW_ES | X86_FSW_B;
5545 }
5546
5547 /* Check for underflow (denormals). */
5548 if (iExponent <= 0)
5549 {
5550 if (fFcw & X86_FCW_UM)
5551 {
5552 if (uMantissa & RT_BIT_64(63))
5553 uMantissa >>= 1;
5554 iExponent = 0;
5555 }
5556 else
5557 {
5558 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5559 fFsw |= X86_FSW_ES | X86_FSW_B;
5560 }
5561 fFsw |= X86_FSW_UE;
5562 }
5563 /* Check for overflow */
5564 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5565 {
5566 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5567 }
5568
5569 /* Compose the result. */
5570 pr80Dst->s.uMantissa = uMantissa;
5571 pr80Dst->s.uExponent = iExponent;
5572 pr80Dst->s.fSign = fSign;
5573 return fFsw;
5574}
5575
5576
5577/**
5578 * See also iemAImpl_fld_r80_from_r32
5579 */
5580static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5581{
5582 uint16_t fFsw = 0;
5583 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5584 {
5585 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5586 pr80Dst->sj64.fInteger = 1;
5587 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5588 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5589 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5590 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5591 }
5592 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5593 {
5594 pr80Dst->s.fSign = pr32Val->s.fSign;
5595 pr80Dst->s.uExponent = 0;
5596 pr80Dst->s.uMantissa = 0;
5597 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5598 }
5599 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5600 {
5601 /* Subnormal -> normalized + X86_FSW_DE return. */
5602 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5603 pr80Dst->sj64.fInteger = 1;
5604 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5605 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5606 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5607 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5608 fFsw = X86_FSW_DE;
5609 }
5610 else if (RTFLOAT32U_IS_INF(pr32Val))
5611 {
5612 pr80Dst->s.fSign = pr32Val->s.fSign;
5613 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5614 pr80Dst->s.uMantissa = RT_BIT_64(63);
5615 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5616 }
5617 else
5618 {
5619 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5620 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5621 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5622 pr80Dst->sj64.fInteger = 1;
5623 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5624 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5625 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5626 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5627 }
5628 return fFsw;
5629}
5630
5631
5632/**
5633 * See also iemAImpl_fld_r80_from_r64
5634 */
5635static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5636{
5637 uint16_t fFsw = 0;
5638 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5639 {
5640 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5641 pr80Dst->sj64.fInteger = 1;
5642 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5643 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5644 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5645 }
5646 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5647 {
5648 pr80Dst->s.fSign = pr64Val->s.fSign;
5649 pr80Dst->s.uExponent = 0;
5650 pr80Dst->s.uMantissa = 0;
5651 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5652 }
5653 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5654 {
5655 /* Subnormal values gets normalized. */
5656 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5657 pr80Dst->sj64.fInteger = 1;
5658 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5659 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5660 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5661 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5662 fFsw = X86_FSW_DE;
5663 }
5664 else if (RTFLOAT64U_IS_INF(pr64Val))
5665 {
5666 pr80Dst->s.fSign = pr64Val->s.fSign;
5667 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5668 pr80Dst->s.uMantissa = RT_BIT_64(63);
5669 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5670 }
5671 else
5672 {
5673 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5674 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5675 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5676 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5677 pr80Dst->sj64.fInteger = 1;
5678 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5679 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5680 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5681 }
5682 return fFsw;
5683}
5684
5685
5686/**
5687 * See also EMIT_FILD.
5688 */
5689#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5690static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5691{ \
5692 if (iVal == 0) \
5693 { \
5694 pr80Dst->s.fSign = 0; \
5695 pr80Dst->s.uExponent = 0; \
5696 pr80Dst->s.uMantissa = 0; \
5697 } \
5698 else \
5699 { \
5700 if (iVal > 0) \
5701 pr80Dst->s.fSign = 0; \
5702 else \
5703 { \
5704 pr80Dst->s.fSign = 1; \
5705 iVal = -iVal; \
5706 } \
5707 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5708 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5709 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5710 } \
5711 return pr80Dst; \
5712}
5713EMIT_CONVERT_IXX_TO_R80(16)
5714EMIT_CONVERT_IXX_TO_R80(32)
5715//EMIT_CONVERT_IXX_TO_R80(64)
5716
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * The 64-bit operand is converted to 80 bits and the r80-by-r80 worker is
 * called on the pair.  A subnormal 64-bit operand (X86_FSW_DE from the
 * conversion) is dealt with first:
 *      - If operand 1 is invalid or a NaN, or @a a_DenormalException is true,
 *        the DE indication is dropped.
 *      - Otherwise, if X86_FCW_DM is clear, operand 1 is returned together
 *        with DE, ES and B and the worker is not called.
 *      - Otherwise DE is ORed into the FSW produced by the worker.
 * The TOP field of the returned FSW is always set to 7.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The r80-by-r80 function doing the work.
 * @param   a_DenormalException     Extra condition (may use pr80Val1) under
 *                                  which DE is not reported.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    uint16_t const fFswTop7 = 7 << X86_FSW_TOP_SHIFT; \
    RTFLOAT80U     r80Val2; \
    uint16_t       fFsw     = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked denormal operand exception. */ \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | fFswTop7 \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | fFswTop7 | fFsw; \
}
5739
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * The 32-bit operand is converted to 80 bits and the r80-by-r80 worker is
 * called on the pair.  A subnormal 32-bit operand (X86_FSW_DE from the
 * conversion) is dealt with first:
 *      - If operand 1 is invalid or a NaN, or @a a_DenormalException is true,
 *        the DE indication is dropped.
 *      - Otherwise, if X86_FCW_DM is clear, operand 1 is returned together
 *        with DE, ES and B and the worker is not called.
 *      - Otherwise DE is ORed into the FSW produced by the worker.
 * The TOP field of the returned FSW is always set to 7.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The r80-by-r80 function doing the work.
 * @param   a_DenormalException     Extra condition (may use pr80Val1) under
 *                                  which DE is not reported.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    uint16_t const fFswTop7 = 7 << X86_FSW_TOP_SHIFT; \
    RTFLOAT80U     r80Val2; \
    uint16_t       fFsw     = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked denormal operand exception. */ \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | fFswTop7 \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | fFswTop7 | fFsw; \
}
5762
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * Converts the 32-bit integer operand to 80-bit floating point, calls the
 * r80-by-r80 worker and then sets the TOP field of the resulting FSW to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The r80-by-r80 function doing the work.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5771
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * Converts the 16-bit integer operand to 80-bit floating point, calls the
 * r80-by-r80 worker and then sets the TOP field of the resulting FSW to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The r80-by-r80 function doing the work.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5780
5781
5782
5783/*********************************************************************************************************************************
*   x87 FPU Division Operations                                                                                                  *
5785*********************************************************************************************************************************/
5786
5787/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5788static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5789 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5790{
5791 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5792 {
5793 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5794 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5795 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5796 }
5797 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5798 { /* Div by zero. */
5799 if (fFcw & X86_FCW_ZM)
5800 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5801 else
5802 {
5803 *pr80Result = *pr80Val1Org;
5804 fFsw |= X86_FSW_ES | X86_FSW_B;
5805 }
5806 fFsw |= X86_FSW_ZE;
5807 }
5808 else
5809 { /* Invalid operand */
5810 if (fFcw & X86_FCW_IM)
5811 *pr80Result = g_r80Indefinite;
5812 else
5813 {
5814 *pr80Result = *pr80Val1Org;
5815 fFsw |= X86_FSW_ES | X86_FSW_B;
5816 }
5817 fFsw |= X86_FSW_IE;
5818 }
5819 return fFsw;
5820}
5821
5822
5823IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5824 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5825{
5826 uint16_t const fFcw = pFpuState->FCW;
5827 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5828
5829 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5830 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5831 {
5832 if (fFcw & X86_FCW_IM)
5833 pFpuRes->r80Result = g_r80Indefinite;
5834 else
5835 {
5836 pFpuRes->r80Result = *pr80Val1;
5837 fFsw |= X86_FSW_ES | X86_FSW_B;
5838 }
5839 fFsw |= X86_FSW_IE;
5840 }
5841 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5842 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5843 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5844 {
5845 if (fFcw & X86_FCW_DM)
5846 {
5847 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5848 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5849 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5850 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5851 }
5852 else
5853 {
5854 pFpuRes->r80Result = *pr80Val1;
5855 fFsw |= X86_FSW_ES | X86_FSW_B;
5856 }
5857 fFsw |= X86_FSW_DE;
5858 }
5859 /* SoftFloat can handle the rest: */
5860 else
5861 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5862
5863 pFpuRes->FSW = fFsw;
5864}
5865
5866
5867EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5868EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5869EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5870EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5871
5872
5873IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5874 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5875{
5876 uint16_t const fFcw = pFpuState->FCW;
5877 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5878
5879 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5880 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5881 {
5882 if (fFcw & X86_FCW_IM)
5883 pFpuRes->r80Result = g_r80Indefinite;
5884 else
5885 {
5886 pFpuRes->r80Result = *pr80Val1;
5887 fFsw |= X86_FSW_ES | X86_FSW_B;
5888 }
5889 fFsw |= X86_FSW_IE;
5890 }
5891 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5892 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5893 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5894 {
5895 if (fFcw & X86_FCW_DM)
5896 {
5897 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5898 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5899 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5900 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5901 }
5902 else
5903 {
5904 pFpuRes->r80Result = *pr80Val1;
5905 fFsw |= X86_FSW_ES | X86_FSW_B;
5906 }
5907 fFsw |= X86_FSW_DE;
5908 }
5909 /* SoftFloat can handle the rest: */
5910 else
5911 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5912
5913 pFpuRes->FSW = fFsw;
5914}
5915
5916
5917EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5918EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5919EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5920EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5921
5922
5923/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5924static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5925 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5926{
5927 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5928 {
5929 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5930 uint16_t fCxFlags = 0;
5931 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5932 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5933 &fCxFlags, &SoftState);
5934 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5935 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5936 if ( !(fFsw & X86_FSW_IE)
5937 && !RTFLOAT80U_IS_NAN(pr80Result)
5938 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5939 {
5940 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5941 fFsw |= fCxFlags & X86_FSW_C_MASK;
5942 }
5943 return fFsw;
5944 }
5945
5946 /* Invalid operand */
5947 if (fFcw & X86_FCW_IM)
5948 *pr80Result = g_r80Indefinite;
5949 else
5950 {
5951 *pr80Result = *pr80Val1Org;
5952 fFsw |= X86_FSW_ES | X86_FSW_B;
5953 }
5954 return fFsw | X86_FSW_IE;
5955}
5956
5957
5958static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5959 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5960{
5961 uint16_t const fFcw = pFpuState->FCW;
5962 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5963
5964 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5965 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5966 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5967 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5968 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5969 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5970 {
5971 if (fFcw & X86_FCW_IM)
5972 pFpuRes->r80Result = g_r80Indefinite;
5973 else
5974 {
5975 pFpuRes->r80Result = *pr80Val1;
5976 fFsw |= X86_FSW_ES | X86_FSW_B;
5977 }
5978 fFsw |= X86_FSW_IE;
5979 }
5980 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5981 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5982 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5983 {
5984 if (fFcw & X86_FCW_DM)
5985 {
5986 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5987 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5988 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5989 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5990 pr80Val1Org, fLegacyInstr);
5991 }
5992 else
5993 {
5994 pFpuRes->r80Result = *pr80Val1;
5995 fFsw |= X86_FSW_ES | X86_FSW_B;
5996 }
5997 fFsw |= X86_FSW_DE;
5998 }
5999 /* SoftFloat can handle the rest: */
6000 else
6001 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
6002 pr80Val1, fLegacyInstr);
6003
6004 pFpuRes->FSW = fFsw;
6005}
6006
6007
6008IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6009 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6010{
6011 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
6012}
6013
6014
6015IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6016 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6017{
6018 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
6019}
6020
6021
6022/*********************************************************************************************************************************
6023* x87 FPU Multiplication Operations *
6024*********************************************************************************************************************************/
6025
6026/** Worker for iemAImpl_fmul_r80_by_r80. */
6027static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6028 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6029{
6030 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6031 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6032 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6033}
6034
6035
6036IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6037 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6038{
6039 uint16_t const fFcw = pFpuState->FCW;
6040 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6041
6042 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6043 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6044 {
6045 if (fFcw & X86_FCW_IM)
6046 pFpuRes->r80Result = g_r80Indefinite;
6047 else
6048 {
6049 pFpuRes->r80Result = *pr80Val1;
6050 fFsw |= X86_FSW_ES | X86_FSW_B;
6051 }
6052 fFsw |= X86_FSW_IE;
6053 }
6054 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6055 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6056 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6057 {
6058 if (fFcw & X86_FCW_DM)
6059 {
6060 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6061 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6062 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6063 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6064 }
6065 else
6066 {
6067 pFpuRes->r80Result = *pr80Val1;
6068 fFsw |= X86_FSW_ES | X86_FSW_B;
6069 }
6070 fFsw |= X86_FSW_DE;
6071 }
6072 /* SoftFloat can handle the rest: */
6073 else
6074 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6075
6076 pFpuRes->FSW = fFsw;
6077}
6078
6079
6080EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
6081EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
6082EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
6083EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
6084
6085
6086/*********************************************************************************************************************************
6087* x87 FPU Addition *
6088*********************************************************************************************************************************/
6089
6090/** Worker for iemAImpl_fadd_r80_by_r80. */
6091static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6092 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6093{
6094 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6095 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6096 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6097}
6098
6099
6100IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6101 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6102{
6103 uint16_t const fFcw = pFpuState->FCW;
6104 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6105
6106 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6107 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6108 {
6109 if (fFcw & X86_FCW_IM)
6110 pFpuRes->r80Result = g_r80Indefinite;
6111 else
6112 {
6113 pFpuRes->r80Result = *pr80Val1;
6114 fFsw |= X86_FSW_ES | X86_FSW_B;
6115 }
6116 fFsw |= X86_FSW_IE;
6117 }
6118 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6119 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6120 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6121 {
6122 if (fFcw & X86_FCW_DM)
6123 {
6124 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6125 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6126 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6127 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6128 }
6129 else
6130 {
6131 pFpuRes->r80Result = *pr80Val1;
6132 fFsw |= X86_FSW_ES | X86_FSW_B;
6133 }
6134 fFsw |= X86_FSW_DE;
6135 }
6136 /* SoftFloat can handle the rest: */
6137 else
6138 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6139
6140 pFpuRes->FSW = fFsw;
6141}
6142
6143
6144EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6145EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6146EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6147EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6148
6149
6150/*********************************************************************************************************************************
6151* x87 FPU Subtraction *
6152*********************************************************************************************************************************/
6153
6154/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6155static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6156 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6157{
6158 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6159 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6160 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6161}
6162
6163
6164IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6165 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6166{
6167 uint16_t const fFcw = pFpuState->FCW;
6168 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6169
6170 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6171 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6172 {
6173 if (fFcw & X86_FCW_IM)
6174 pFpuRes->r80Result = g_r80Indefinite;
6175 else
6176 {
6177 pFpuRes->r80Result = *pr80Val1;
6178 fFsw |= X86_FSW_ES | X86_FSW_B;
6179 }
6180 fFsw |= X86_FSW_IE;
6181 }
6182 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6183 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6184 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6185 {
6186 if (fFcw & X86_FCW_DM)
6187 {
6188 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6189 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6190 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6191 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6192 }
6193 else
6194 {
6195 pFpuRes->r80Result = *pr80Val1;
6196 fFsw |= X86_FSW_ES | X86_FSW_B;
6197 }
6198 fFsw |= X86_FSW_DE;
6199 }
6200 /* SoftFloat can handle the rest: */
6201 else
6202 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6203
6204 pFpuRes->FSW = fFsw;
6205}
6206
6207
/* Instantiate the FSUB variants whose second operand is a 64-bit real, a 32-bit
   real, a 32-bit integer or a 16-bit integer.  Each invocation names
   iemAImpl_fsub_r80_by_r80 as the r80-by-r80 implementation it is based on.
   NOTE(review): the EMIT_R80_BY_* macros are defined elsewhere in this file;
   presumably they convert the operand to r80 and forward - confirm there. */
EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6212
6213
6214/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6215IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6216 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6217{
6218 uint16_t const fFcw = pFpuState->FCW;
6219 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6220
6221 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6222 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6223 {
6224 if (fFcw & X86_FCW_IM)
6225 pFpuRes->r80Result = g_r80Indefinite;
6226 else
6227 {
6228 pFpuRes->r80Result = *pr80Val1;
6229 fFsw |= X86_FSW_ES | X86_FSW_B;
6230 }
6231 fFsw |= X86_FSW_IE;
6232 }
6233 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6234 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6235 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6236 {
6237 if (fFcw & X86_FCW_DM)
6238 {
6239 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6240 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6241 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6242 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6243 }
6244 else
6245 {
6246 pFpuRes->r80Result = *pr80Val1;
6247 fFsw |= X86_FSW_ES | X86_FSW_B;
6248 }
6249 fFsw |= X86_FSW_DE;
6250 }
6251 /* SoftFloat can handle the rest: */
6252 else
6253 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6254
6255 pFpuRes->FSW = fFsw;
6256}
6257
6258
/* Instantiate the FSUBR variants whose second operand is a 64-bit real, a 32-bit
   real, a 32-bit integer or a 16-bit integer.  Each invocation names
   iemAImpl_fsubr_r80_by_r80 as the r80-by-r80 implementation it is based on.
   NOTE(review): the EMIT_R80_BY_* macros are defined elsewhere in this file;
   presumably they convert the operand to r80 and forward - confirm there. */
EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6263
6264
6265/*********************************************************************************************************************************
6266* x87 FPU Trigonometric Operations *
6267*********************************************************************************************************************************/
6268static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6269{
6270 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6271 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6272 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6273 extFloat80_t v;
6274 (void)fFcw;
6275
6276 v = extF80_atan2(y, x, &SoftState);
6277
6278 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6279 return fFsw;
6280}
6281
6282IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6283 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6284{
6285 uint16_t const fFcw = pFpuState->FCW;
6286 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6287
6288 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6289 {
6290 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6291
6292 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6293 if (!(fFcw & X86_FCW_PM))
6294 fFsw |= X86_FSW_ES | X86_FSW_B;
6295 }
6296 else
6297 {
6298 fFsw |= X86_FSW_IE;
6299 if (!(fFcw & X86_FCW_IM))
6300 {
6301 pFpuRes->r80Result = *pr80Val2;
6302 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6303 }
6304 else
6305 {
6306 pFpuRes->r80Result = g_r80Indefinite;
6307 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6308 }
6309 }
6310
6311 pFpuRes->FSW = fFsw;
6312}
6313#endif /* IEM_WITHOUT_ASSEMBLY */
6314
/** Intel-specific FPATAN entry point; forwards unchanged to iemAImpl_fpatan_r80_by_r80. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                          PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6320
/** AMD-specific FPATAN entry point; forwards unchanged to iemAImpl_fpatan_r80_by_r80. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6326
6327
6328#if defined(IEM_WITHOUT_ASSEMBLY)
6329static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6330{
6331 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6332 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6333 extFloat80_t v;
6334 (void)fFcw;
6335
6336 v = extF80_tan(x, &SoftState);
6337
6338 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6339 return fFsw;
6340}
6341
6342IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6343{
6344 uint16_t const fFcw = pFpuState->FCW;
6345 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6346
6347 if (RTFLOAT80U_IS_ZERO(pr80Val))
6348 {
6349 pFpuResTwo->r80Result1 = *pr80Val;
6350 pFpuResTwo->r80Result2 = g_ar80One[0];
6351 }
6352 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6353 {
6354 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6355 {
6356 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6357 pFpuResTwo->r80Result1 = *pr80Val;
6358 }
6359 else
6360 {
6361 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6362 {
6363 pFpuResTwo->r80Result1 = *pr80Val;
6364 }
6365 else
6366 {
6367 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6368 }
6369
6370 pFpuResTwo->r80Result2 = g_ar80One[0];
6371
6372 fFsw |= X86_FSW_PE;
6373 if (!(fFcw & X86_FCW_PM))
6374 fFsw |= X86_FSW_ES | X86_FSW_B;
6375 }
6376 }
6377 else
6378 {
6379 fFsw |= X86_FSW_IE;
6380 if (!(fFcw & X86_FCW_IM))
6381 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6382 }
6383
6384 pFpuResTwo->FSW = fFsw;
6385}
6386#endif /* IEM_WITHOUT_ASSEMBLY */
6387
/** AMD-specific FPTAN entry point; forwards unchanged to iemAImpl_fptan_r80_r80. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6392
/** Intel-specific FPTAN entry point; forwards unchanged to iemAImpl_fptan_r80_r80. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6397
6398#ifdef IEM_WITHOUT_ASSEMBLY
6399
6400static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6401{
6402 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6403 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6404 extFloat80_t v;
6405 (void)fFcw;
6406
6407 v = extF80_sin(x, &SoftState);
6408
6409 iemFpuSoftF80ToIprt(pr80Result, v);
6410
6411 return fFsw;
6412}
6413
6414IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6415{
6416 uint16_t const fFcw = pFpuState->FCW;
6417 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6418
6419 if (RTFLOAT80U_IS_ZERO(pr80Val))
6420 {
6421 pFpuRes->r80Result = *pr80Val;
6422 }
6423 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6424 {
6425 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6426 {
6427 fFsw |= X86_FSW_C2;
6428 pFpuRes->r80Result = *pr80Val;
6429 }
6430 else
6431 {
6432 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6433 {
6434 pFpuRes->r80Result = *pr80Val;
6435 }
6436 else
6437 {
6438 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6439 }
6440 fFsw |= X86_FSW_PE;
6441 if (!(fFcw & X86_FCW_PM))
6442 fFsw |= X86_FSW_ES | X86_FSW_B;
6443 }
6444 }
6445 else if (RTFLOAT80U_IS_INF(pr80Val))
6446 {
6447 fFsw |= X86_FSW_IE;
6448 if (!(fFcw & X86_FCW_IM))
6449 {
6450 fFsw |= X86_FSW_ES | X86_FSW_B;
6451 pFpuRes->r80Result = *pr80Val;
6452 }
6453 else
6454 {
6455 pFpuRes->r80Result = g_r80Indefinite;
6456 }
6457 }
6458 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6459 {
6460 fFsw |= X86_FSW_DE;
6461
6462 if (fFcw & X86_FCW_DM)
6463 {
6464 if (fFcw & X86_FCW_UM)
6465 {
6466 pFpuRes->r80Result = *pr80Val;
6467 }
6468 else
6469 {
6470 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6471 uint64_t uMantissa = pr80Val->s.uMantissa;
6472 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6473
6474 uExponent = 64 - uExponent;
6475 uMantissa <<= uExponent;
6476 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6477
6478 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6479 pFpuRes->r80Result.s.uMantissa = uMantissa;
6480 pFpuRes->r80Result.s.uExponent = uExponent;
6481 }
6482
6483 fFsw |= X86_FSW_UE | X86_FSW_PE;
6484
6485 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6486 {
6487 /* All the exceptions are masked. */
6488 }
6489 else
6490 {
6491 fFsw |= X86_FSW_ES | X86_FSW_B;
6492 }
6493 }
6494 else
6495 {
6496 pFpuRes->r80Result = *pr80Val;
6497
6498 fFsw |= X86_FSW_ES | X86_FSW_B;
6499 }
6500 }
6501 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6502 {
6503 pFpuRes->r80Result = *pr80Val;
6504 fFsw |= X86_FSW_DE;
6505
6506 if (fFcw & X86_FCW_DM)
6507 {
6508 if (fFcw & X86_FCW_PM)
6509 {
6510 fFsw |= X86_FSW_PE;
6511 }
6512 else
6513 {
6514 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6515 }
6516
6517 pFpuRes->r80Result.sj64.uExponent = 1;
6518 }
6519 else
6520 {
6521 fFsw |= X86_FSW_ES | X86_FSW_B;
6522 }
6523 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6524 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6525 {
6526 pFpuRes->r80Result = *pr80Val;
6527 } else {
6528 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6529 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6530 && (fFcw & X86_FCW_IM))
6531 pFpuRes->r80Result = g_r80Indefinite;
6532 else
6533 {
6534 pFpuRes->r80Result = *pr80Val;
6535 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6536 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6537 }
6538
6539 fFsw |= X86_FSW_IE;
6540 if (!(fFcw & X86_FCW_IM))
6541 fFsw |= X86_FSW_ES | X86_FSW_B;
6542 }
6543
6544 pFpuRes->FSW = fFsw;
6545}
6546#endif /* IEM_WITHOUT_ASSEMBLY */
6547
/** AMD-specific FSIN entry point; forwards unchanged to iemAImpl_fsin_r80. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6552
/** Intel-specific FSIN entry point; forwards unchanged to iemAImpl_fsin_r80. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6557
6558#ifdef IEM_WITHOUT_ASSEMBLY
6559
6560static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6561{
6562 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6563 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6564 extFloat80_t v;
6565 (void)fFcw;
6566
6567 v = extF80_cos(x, &SoftState);
6568
6569 iemFpuSoftF80ToIprt(pr80Result, v);
6570
6571 return fFsw;
6572}
6573
6574IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6575{
6576 uint16_t const fFcw = pFpuState->FCW;
6577 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6578
6579 if (RTFLOAT80U_IS_ZERO(pr80Val))
6580 {
6581 pFpuRes->r80Result = g_ar80One[0];
6582 }
6583 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6584 {
6585 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6586 {
6587 fFsw |= X86_FSW_C2;
6588 pFpuRes->r80Result = *pr80Val;
6589 }
6590 else
6591 {
6592 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6593 {
6594 pFpuRes->r80Result = g_ar80One[0];
6595
6596 }
6597 else
6598 {
6599 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6600 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6601 }
6602 fFsw |= X86_FSW_PE;
6603 if (!(fFcw & X86_FCW_PM))
6604 fFsw |= X86_FSW_ES | X86_FSW_B;
6605 }
6606 }
6607 else if (RTFLOAT80U_IS_INF(pr80Val))
6608 {
6609 fFsw |= X86_FSW_IE;
6610 if (!(fFcw & X86_FCW_IM))
6611 {
6612 fFsw |= X86_FSW_ES | X86_FSW_B;
6613 pFpuRes->r80Result = *pr80Val;
6614 }
6615 else
6616 {
6617 pFpuRes->r80Result = g_r80Indefinite;
6618 }
6619 }
6620 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6621 {
6622 fFsw |= X86_FSW_DE;
6623
6624 if (fFcw & X86_FCW_DM)
6625 {
6626 pFpuRes->r80Result = g_ar80One[0];
6627
6628 if (fFcw & X86_FCW_PM)
6629 {
6630 fFsw |= X86_FSW_PE;
6631 }
6632 else
6633 {
6634 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6635 }
6636 }
6637 else
6638 {
6639 pFpuRes->r80Result = *pr80Val;
6640 fFsw |= X86_FSW_ES | X86_FSW_B;
6641 }
6642 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6643 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6644 {
6645 pFpuRes->r80Result = *pr80Val;
6646 } else {
6647 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6648 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6649 && (fFcw & X86_FCW_IM))
6650 pFpuRes->r80Result = g_r80Indefinite;
6651 else
6652 {
6653 pFpuRes->r80Result = *pr80Val;
6654 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6655 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6656 }
6657
6658 fFsw |= X86_FSW_IE;
6659 if (!(fFcw & X86_FCW_IM))
6660 fFsw |= X86_FSW_ES | X86_FSW_B;
6661 }
6662
6663 pFpuRes->FSW = fFsw;
6664}
6665#endif /* IEM_WITHOUT_ASSEMBLY */
6666
/** AMD-specific FCOS entry point; forwards unchanged to iemAImpl_fcos_r80. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6671
/** Intel-specific FCOS entry point; forwards unchanged to iemAImpl_fcos_r80. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6676
6677#ifdef IEM_WITHOUT_ASSEMBLY
6678
6679static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6680{
6681 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6682 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6683 extFloat80_t r80Sin, r80Cos;
6684 (void)fFcw;
6685
6686 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6687
6688 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6689 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6690
6691 return fFsw;
6692}
6693
6694IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6695{
6696 uint16_t const fFcw = pFpuState->FCW;
6697 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6698
6699 if (RTFLOAT80U_IS_ZERO(pr80Val))
6700 {
6701 pFpuResTwo->r80Result1 = *pr80Val;
6702 pFpuResTwo->r80Result2 = g_ar80One[0];
6703 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6704 }
6705 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6706 {
6707 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6708 {
6709 fFsw |= X86_FSW_C2;
6710
6711 if (fFcw & X86_FCW_IM)
6712 {
6713 pFpuResTwo->r80Result1 = g_r80Indefinite;
6714 }
6715 else
6716 {
6717 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6718 }
6719
6720 pFpuResTwo->r80Result2 = *pr80Val;
6721 }
6722 else
6723 {
6724 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6725
6726 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6727 {
6728 pFpuResTwo->r80Result1 = *pr80Val;
6729 pFpuResTwo->r80Result2 = g_ar80One[0];
6730 }
6731 else
6732 {
6733 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6734 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6735 }
6736 fFsw |= X86_FSW_PE;
6737 if (!(fFcw & X86_FCW_PM))
6738 fFsw |= X86_FSW_ES | X86_FSW_B;
6739 }
6740 }
6741 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6742 {
6743 fFsw |= X86_FSW_DE;
6744
6745 if (fFcw & X86_FCW_DM)
6746 {
6747 pFpuResTwo->r80Result1 = *pr80Val;
6748 pFpuResTwo->r80Result2 = g_ar80One[0];
6749 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6750
6751 if (fFcw & X86_FCW_PM)
6752 {
6753 fFsw |= X86_FSW_PE;
6754 }
6755 else
6756 {
6757 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6758 }
6759
6760 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6761 }
6762 else
6763 {
6764 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6765 pFpuResTwo->r80Result2 = *pr80Val;
6766 fFsw |= X86_FSW_ES | X86_FSW_B;
6767 }
6768 }
6769 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6770 {
6771 fFsw |= X86_FSW_DE;
6772
6773 if (fFcw & X86_FCW_DM)
6774 {
6775 pFpuResTwo->r80Result2 = g_ar80One[0];
6776
6777 if (fFcw & X86_FCW_UM)
6778 {
6779 pFpuResTwo->r80Result1 = *pr80Val;
6780 }
6781 else
6782 {
6783 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6784 uint64_t uMantissa = pr80Val->s.uMantissa;
6785 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6786
6787 uExponent = 64 - uExponent;
6788 uMantissa <<= uExponent;
6789 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6790
6791 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6792 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6793 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6794 }
6795
6796 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6797 fFsw |= X86_FSW_UE | X86_FSW_PE;
6798
6799 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6800 {
6801 /* All the exceptions are masked. */
6802 }
6803 else
6804 {
6805 fFsw |= X86_FSW_ES | X86_FSW_B;
6806 }
6807 }
6808 else
6809 {
6810 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6811 pFpuResTwo->r80Result2 = *pr80Val;
6812 fFsw |= X86_FSW_ES | X86_FSW_B;
6813 }
6814 }
6815 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6816 {
6817 pFpuResTwo->r80Result1 = *pr80Val;
6818 pFpuResTwo->r80Result2 = *pr80Val;
6819 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6820 }
6821 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6822 {
6823 if (fFcw & X86_FCW_IM)
6824 {
6825 pFpuResTwo->r80Result1 = g_r80Indefinite;
6826 pFpuResTwo->r80Result2 = g_r80Indefinite;
6827 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6828 }
6829 else
6830 {
6831 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6832 pFpuResTwo->r80Result2 = *pr80Val;
6833 }
6834
6835 fFsw |= X86_FSW_IE;
6836 if (!(fFcw & X86_FCW_IM))
6837 fFsw |= X86_FSW_ES | X86_FSW_B;
6838 }
6839 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6840 {
6841 pFpuResTwo->r80Result1 = *pr80Val;
6842 pFpuResTwo->r80Result2 = *pr80Val;
6843
6844 if (fFcw & X86_FCW_IM)
6845 {
6846 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6847 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6848 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6849 }
6850 else
6851 {
6852 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6853 pFpuResTwo->r80Result2 = *pr80Val;
6854 }
6855
6856 fFsw |= X86_FSW_IE;
6857 if (!(fFcw & X86_FCW_IM))
6858 fFsw |= X86_FSW_ES | X86_FSW_B;
6859 }
6860 else if (RTFLOAT80U_IS_INF(pr80Val))
6861 {
6862 if (fFcw & X86_FCW_IM)
6863 {
6864 pFpuResTwo->r80Result1 = g_r80Indefinite;
6865 pFpuResTwo->r80Result2 = g_r80Indefinite;
6866 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6867 }
6868 else
6869 {
6870 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6871 pFpuResTwo->r80Result2 = *pr80Val;
6872 }
6873
6874 fFsw |= X86_FSW_IE;
6875 if (!(fFcw & X86_FCW_IM))
6876 fFsw |= X86_FSW_ES | X86_FSW_B;
6877 }
6878
6879 pFpuResTwo->FSW = fFsw;
6880}
6881#endif /* IEM_WITHOUT_ASSEMBLY */
6882
/** AMD-specific FSINCOS entry point; forwards unchanged to iemAImpl_fsincos_r80_r80. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6887
/** Intel-specific FSINCOS entry point; forwards unchanged to iemAImpl_fsincos_r80_r80. */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6892
6893#ifdef IEM_WITHOUT_ASSEMBLY
6894
6895
6896/*********************************************************************************************************************************
6897* x87 FPU Compare and Testing Operations *
6898*********************************************************************************************************************************/
6899
6900IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6901{
6902 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6903
6904 if (RTFLOAT80U_IS_ZERO(pr80Val))
6905 fFsw |= X86_FSW_C3;
6906 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6907 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6908 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6909 {
6910 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6911 if (!(pFpuState->FCW & X86_FCW_DM))
6912 fFsw |= X86_FSW_ES | X86_FSW_B;
6913 }
6914 else
6915 {
6916 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6917 if (!(pFpuState->FCW & X86_FCW_IM))
6918 fFsw |= X86_FSW_ES | X86_FSW_B;
6919 }
6920
6921 *pu16Fsw = fFsw;
6922}
6923
6924
6925IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6926{
6927 RT_NOREF(pFpuState);
6928 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6929
6930 /* C1 = sign bit (always, even if empty Intel says). */
6931 if (pr80Val->s.fSign)
6932 fFsw |= X86_FSW_C1;
6933
6934 /* Classify the value in C0, C2, C3. */
6935 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6936 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6937 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6938 fFsw |= X86_FSW_C2;
6939 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6940 fFsw |= X86_FSW_C3;
6941 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6942 fFsw |= X86_FSW_C0;
6943 else if (RTFLOAT80U_IS_INF(pr80Val))
6944 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6945 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6946 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6947 /* whatever else: 0 */
6948
6949 *pu16Fsw = fFsw;
6950}
6951
6952
6953/**
6954 * Worker for fcom, fucom, and friends.
6955 */
6956static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6957 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6958{
6959 /*
6960 * Unpack the values.
6961 */
6962 bool const fSign1 = pr80Val1->s.fSign;
6963 int32_t iExponent1 = pr80Val1->s.uExponent;
6964 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6965
6966 bool const fSign2 = pr80Val2->s.fSign;
6967 int32_t iExponent2 = pr80Val2->s.uExponent;
6968 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6969
6970 /*
6971 * Check for invalid inputs.
6972 */
6973 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6974 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6975 {
6976 if (!(fFcw & X86_FCW_IM))
6977 fFsw |= X86_FSW_ES | X86_FSW_B;
6978 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6979 }
6980
6981 /*
6982 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6983 */
6984 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6985 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6986 {
6987 if ( fIeOnAllNaNs
6988 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6989 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6990 {
6991 fFsw |= X86_FSW_IE;
6992 if (!(fFcw & X86_FCW_IM))
6993 fFsw |= X86_FSW_ES | X86_FSW_B;
6994 }
6995 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6996 }
6997
6998 /*
6999 * Normalize the values.
7000 */
7001 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7002 {
7003 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7004 iExponent1 = 1;
7005 else
7006 {
7007 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
7008 uMantissa1 <<= iExponent1;
7009 iExponent1 = 1 - iExponent1;
7010 }
7011 fFsw |= X86_FSW_DE;
7012 if (!(fFcw & X86_FCW_DM))
7013 fFsw |= X86_FSW_ES | X86_FSW_B;
7014 }
7015
7016 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7017 {
7018 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7019 iExponent2 = 1;
7020 else
7021 {
7022 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
7023 uMantissa2 <<= iExponent2;
7024 iExponent2 = 1 - iExponent2;
7025 }
7026 fFsw |= X86_FSW_DE;
7027 if (!(fFcw & X86_FCW_DM))
7028 fFsw |= X86_FSW_ES | X86_FSW_B;
7029 }
7030
7031 /*
7032 * Test if equal (val1 == val2):
7033 */
7034 if ( uMantissa1 == uMantissa2
7035 && iExponent1 == iExponent2
7036 && ( fSign1 == fSign2
7037 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
7038 fFsw |= X86_FSW_C3;
7039 /*
7040 * Test if less than (val1 < val2):
7041 */
7042 else if (fSign1 && !fSign2)
7043 fFsw |= X86_FSW_C0;
7044 else if (fSign1 == fSign2)
7045 {
7046 /* Zeros are problematic, however at the most one can be zero here. */
7047 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
7048 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7049 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
7050 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7051
7052 if ( fSign1
7053 ^ ( iExponent1 < iExponent2
7054 || ( iExponent1 == iExponent2
7055 && uMantissa1 < uMantissa2 ) ) )
7056 fFsw |= X86_FSW_C0;
7057 }
7058 /* else: No flags set if greater. */
7059
7060 return fFsw;
7061}
7062
7063
/**
 * FCOM with two 80-bit operands.
 *
 * Calls the compare worker with TOP preset to 6 and with #IE enabled for all
 * kinds of NaNs; the incoming FSW is not read.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
}
7069
7070
7071
7072
/**
 * FUCOM with two 80-bit operands.
 *
 * Same as iemAImpl_fcom_r80_by_r80 except that fIeOnAllNaNs is false, so the
 * worker only raises #IE for signalling NaNs (and invalid encodings).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
}
7078
7079
7080IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7081 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
7082{
7083 RTFLOAT80U r80Val2;
7084 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
7085 Assert(!fFsw || fFsw == X86_FSW_DE);
7086 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7087 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7088 {
7089 if (!(pFpuState->FCW & X86_FCW_DM))
7090 fFsw |= X86_FSW_ES | X86_FSW_B;
7091 *pfFsw |= fFsw;
7092 }
7093}
7094
7095
7096IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7097 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
7098{
7099 RTFLOAT80U r80Val2;
7100 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
7101 Assert(!fFsw || fFsw == X86_FSW_DE);
7102 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7103 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7104 {
7105 if (!(pFpuState->FCW & X86_FCW_DM))
7106 fFsw |= X86_FSW_ES | X86_FSW_B;
7107 *pfFsw |= fFsw;
7108 }
7109}
7110
7111
7112IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7113 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
7114{
7115 RTFLOAT80U r80Val2;
7116 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
7117 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7118}
7119
7120
7121IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7122 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
7123{
7124 RTFLOAT80U r80Val2;
7125 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
7126 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7127}
7128
7129
7130/**
7131 * Worker for fcomi & fucomi.
7132 */
7133static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7134 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
7135{
7136 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
7137 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
7138 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
7139 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
7140
7141 /* Note! C1 is not cleared as per docs! Everything is preserved. */
7142 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
7143 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
7144}
7145
7146
/**
 * FCOMI with two 80-bit operands.
 *
 * Calls the fcomi worker with the current FCW and FSW and with #IE enabled for
 * all kinds of NaNs.  Returns the worker's EFLAGS value; the FSW goes to
 * @a pfFsw.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                       PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
}
7152
7153
/**
 * FUCOMI with two 80-bit operands.
 *
 * Same as iemAImpl_fcomi_r80_by_r80 except that fIeOnAllNaNs is false, so the
 * compare worker only raises #IE for signalling NaNs (and invalid encodings).
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
}
7159
7160
7161/*********************************************************************************************************************************
7162* x87 FPU Other Operations *
7163*********************************************************************************************************************************/
7164
7165/**
7166 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7167 */
7168static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7169{
7170 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7171 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7172 true /*exact / generate #PE */, &SoftState));
7173 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7174}
7175
7176
7177IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7178{
7179 uint16_t const fFcw = pFpuState->FCW;
7180 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7181
7182 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7183 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7184 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7185 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7186 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7187 || RTFLOAT80U_IS_INF(pr80Val))
7188 pFpuRes->r80Result = *pr80Val;
7189 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7190 {
7191 fFsw |= X86_FSW_DE;
7192 if (fFcw & X86_FCW_DM)
7193 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7194 else
7195 {
7196 pFpuRes->r80Result = *pr80Val;
7197 fFsw |= X86_FSW_ES | X86_FSW_B;
7198 }
7199 }
7200 else
7201 {
7202 if (fFcw & X86_FCW_IM)
7203 {
7204 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7205 pFpuRes->r80Result = g_r80Indefinite;
7206 else
7207 {
7208 pFpuRes->r80Result = *pr80Val;
7209 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7210 }
7211 }
7212 else
7213 {
7214 pFpuRes->r80Result = *pr80Val;
7215 fFsw |= X86_FSW_ES | X86_FSW_B;
7216 }
7217 fFsw |= X86_FSW_IE;
7218 }
7219 pFpuRes->FSW = fFsw;
7220}
7221
7222
7223IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7224 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7225{
7226 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7227 it does everything we need it to do. */
7228 uint16_t const fFcw = pFpuState->FCW;
7229 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7230 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7231 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7232 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7233}
7234
7235
7236/**
7237 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7238 */
7239static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7240{
7241 Assert(!pr80Val->s.fSign);
7242 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7243 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7244 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7245}
7246
7247
7248IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7249{
7250 uint16_t const fFcw = pFpuState->FCW;
7251 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7252
7253 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7254 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7255 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7256 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7257 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7258 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7259 pFpuRes->r80Result = *pr80Val;
7260 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7261 {
7262 fFsw |= X86_FSW_DE;
7263 if (fFcw & X86_FCW_DM)
7264 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7265 else
7266 {
7267 pFpuRes->r80Result = *pr80Val;
7268 fFsw |= X86_FSW_ES | X86_FSW_B;
7269 }
7270 }
7271 else
7272 {
7273 if (fFcw & X86_FCW_IM)
7274 {
7275 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7276 pFpuRes->r80Result = g_r80Indefinite;
7277 else
7278 {
7279 pFpuRes->r80Result = *pr80Val;
7280 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7281 }
7282 }
7283 else
7284 {
7285 pFpuRes->r80Result = *pr80Val;
7286 fFsw |= X86_FSW_ES | X86_FSW_B;
7287 }
7288 fFsw |= X86_FSW_IE;
7289 }
7290 pFpuRes->FSW = fFsw;
7291}
7292
7293
7294/**
7295 * @code{.unparsed}
 *          x        x * ln2
 *  f(x) = 2  - 1 = e        - 1
7298 *
7299 * @endcode
7300 *
7301 * We can approximate e^x by a Taylor/Maclaurin series (see
7302 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7303 * @code{.unparsed}
 *             n      0     1     2     3     4
 *     inf    x      x     x     x     x     x
 *     SUM  ----- = --- + --- + --- + --- + --- + ...
 *     n=0    n!     0!    1!    2!    3!    4!
 *
 *                       2     3     4
 *                      x     x     x
 *          = 1 + x + --- + --- + --- + ...
 *                    2!    3!    4!
7313 * @endcode
7314 *
7315 * Given z = x * ln2, we get:
7316 * @code{.unparsed}
 *                   2     3     4           n
 *   z              z     z     z           z
 *  e  - 1 = z + --- + --- + --- + ... + ---
 *                2!    3!    4!          n!
7321 * @endcode
7322 *
7323 * Wanting to use Horner's method, we move one z outside and get:
7324 * @code{.unparsed}
 *                             2     3            (n-1)
 *                      z     z     z            z
 *          = z ( 1 + --- + --- + --- + ... + ------- )
 *                    2!    3!    4!           n!
7329 * @endcode
7330 *
7331 * The constants we need for using Horner's methods are 1 and 1 / n!.
7332 *
 * For very tiny x values, we can get away with f(x) = x * ln 2, because
 * we don't have the necessary precision to represent 1.0 + z/2 + ...
 * and can approximate it to be 1.0.  For a visual demonstration of this
 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
 * as it is valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7338 *
7339 *
7340 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7341 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7342 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7343 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7344 * blocks). (The one bit difference is probably an implicit one missing from
7345 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7346 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7347 * exponent.
7348 *
 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have
 * not yet successfully reproduced the exact results from an Intel 10980XE;
 * there is always a portion of rounding differences.  Not going to spend too
 * much time on getting this 100% the same, at least not now.
7353 *
 * P.S. If someone is really curious about the 8087 and its constants:
 *          http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7356 *
7357 *
7358 * @param pr80Val The exponent value (x), less than 1.0, greater than
7359 * -1.0 and not zero. This can be a normal, denormal
7360 * or pseudo-denormal value.
7361 * @param pr80Result Where to return the result.
7362 * @param fFcw FPU control word.
7363 * @param fFsw FPU status word.
7364 */
7365static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7366{
7367 /* As mentioned above, we can skip the expensive polynomial calculation
7368 as it will be close enough to 1.0 that it makes no difference.
7369
7370 The cutoff point for intel 10980XE is exponents >= -69. Intel
7371 also seems to be using a 67-bit or 68-bit constant value, and we get
7372 a smattering of rounding differences if we go for higher precision. */
7373 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7374 {
7375 RTUINT256U u256;
7376 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7377 u256.QWords.qw0 |= 1; /* force #PE */
7378 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7379 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7380 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7381 : 1 - RTFLOAT80U_EXP_BIAS,
7382 fFcw, fFsw);
7383 }
7384 else
7385 {
7386#ifdef IEM_WITH_FLOAT128_FOR_FPU
7387 /* This approach is not good enough for small values, we end up with zero. */
7388 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7389 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7390 _Float128 rd128Result = powf128(2.0L, rd128Val);
7391 rd128Result -= 1.0L;
7392 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7393 iemFpuF128RestoreRounding(fOldRounding);
7394
7395# else
7396 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7397 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7398
7399 /* As mentioned above, enforce 68-bit internal mantissa width to better
7400 match the Intel 10980XE results. */
7401 unsigned const cPrecision = 68;
7402
7403 /* first calculate z = x * ln2 */
7404 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7405 cPrecision);
7406
7407 /* Then do the polynomial evaluation. */
7408 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7409 cPrecision, &SoftState);
7410 r = f128_mul(z, r, &SoftState);
7411
7412 /* Output the result. */
7413 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7414# endif
7415 }
7416 return fFsw;
7417}
7418
7419
7420IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7421{
7422 uint16_t const fFcw = pFpuState->FCW;
7423 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7424
7425 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7426 {
7427 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7428 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7429 else
7430 {
7431 /* Special case:
7432 2^+1.0 - 1.0 = 1.0
7433 2^-1.0 - 1.0 = -0.5 */
7434 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7435 && pr80Val->s.uMantissa == RT_BIT_64(63))
7436 {
7437 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7438 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7439 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7440 }
7441 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7442 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7443 else
7444 pFpuRes->r80Result = *pr80Val;
7445 fFsw |= X86_FSW_PE;
7446 if (!(fFcw & X86_FCW_PM))
7447 fFsw |= X86_FSW_ES | X86_FSW_B;
7448 }
7449 }
7450 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7451 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7452 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7453 pFpuRes->r80Result = *pr80Val;
7454 else if (RTFLOAT80U_IS_INF(pr80Val))
7455 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7456 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7457 {
7458 fFsw |= X86_FSW_DE;
7459 if (fFcw & X86_FCW_DM)
7460 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7461 else
7462 {
7463 pFpuRes->r80Result = *pr80Val;
7464 fFsw |= X86_FSW_ES | X86_FSW_B;
7465 }
7466 }
7467 else
7468 {
7469 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7470 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7471 && (fFcw & X86_FCW_IM))
7472 pFpuRes->r80Result = g_r80Indefinite;
7473 else
7474 {
7475 pFpuRes->r80Result = *pr80Val;
7476 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7477 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7478 }
7479 fFsw |= X86_FSW_IE;
7480 if (!(fFcw & X86_FCW_IM))
7481 fFsw |= X86_FSW_ES | X86_FSW_B;
7482 }
7483 pFpuRes->FSW = fFsw;
7484}
7485
7486#endif /* IEM_WITHOUT_ASSEMBLY */
7487
7488IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7489{
7490 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7491}
7492
7493IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7494{
7495 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7496}
7497
7498#ifdef IEM_WITHOUT_ASSEMBLY
7499
7500IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7501{
7502 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7503 pFpuRes->r80Result = *pr80Val;
7504 pFpuRes->r80Result.s.fSign = 0;
7505}
7506
7507
7508IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7509{
7510 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7511 pFpuRes->r80Result = *pr80Val;
7512 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7513}
7514
7515
7516IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7517{
7518 uint16_t const fFcw = pFpuState->FCW;
7519 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7520
7521 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7522 {
7523 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7524 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7525
7526 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7527 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7528 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7529 }
7530 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7531 {
7532 fFsw |= X86_FSW_ZE;
7533 if (fFcw & X86_FCW_ZM)
7534 {
7535 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7536 pFpuResTwo->r80Result2 = *pr80Val;
7537 }
7538 else
7539 {
7540 pFpuResTwo->r80Result2 = *pr80Val;
7541 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7542 }
7543 }
7544 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7545 {
7546 fFsw |= X86_FSW_DE;
7547 if (fFcw & X86_FCW_DM)
7548 {
7549 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7550 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7551 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7552 int32_t iExponent = -16382;
7553 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7554 {
7555 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7556 iExponent--;
7557 }
7558
7559 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7560 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7561 }
7562 else
7563 {
7564 pFpuResTwo->r80Result2 = *pr80Val;
7565 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7566 }
7567 }
7568 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7569 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7570 {
7571 pFpuResTwo->r80Result1 = *pr80Val;
7572 pFpuResTwo->r80Result2 = *pr80Val;
7573 }
7574 else if (RTFLOAT80U_IS_INF(pr80Val))
7575 {
7576 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7577 pFpuResTwo->r80Result2 = *pr80Val;
7578 }
7579 else
7580 {
7581 if (fFcw & X86_FCW_IM)
7582 {
7583 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7584 pFpuResTwo->r80Result1 = g_r80Indefinite;
7585 else
7586 {
7587 pFpuResTwo->r80Result1 = *pr80Val;
7588 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7589 }
7590 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7591 }
7592 else
7593 {
7594 pFpuResTwo->r80Result2 = *pr80Val;
7595 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7596 }
7597 fFsw |= X86_FSW_IE;
7598 }
7599 pFpuResTwo->FSW = fFsw;
7600}
7601#endif /* IEM_WITHOUT_ASSEMBLY */
7602
7603#if defined(IEM_WITHOUT_ASSEMBLY)
7604
7605static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7606{
7607 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7608 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7609 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7610 extFloat80_t v;
7611 (void)fFcw;
7612
7613 v = extF80_ylog2x(y, x, &SoftState);
7614 iemFpuSoftF80ToIprt(pr80Result, v);
7615
7616 return fFsw;
7617}
7618
7619IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7620 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7621{
7622 uint16_t const fFcw = pFpuState->FCW;
7623 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7624
7625 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7626 {
7627 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7628
7629 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7630 if (!(fFcw & X86_FCW_PM))
7631 fFsw |= X86_FSW_ES | X86_FSW_B;
7632 }
7633 else
7634 {
7635 fFsw |= X86_FSW_IE;
7636
7637 if (!(fFcw & X86_FCW_IM))
7638 {
7639 pFpuRes->r80Result = *pr80Val2;
7640 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7641 }
7642 else
7643 {
7644 pFpuRes->r80Result = g_r80Indefinite;
7645 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7646 }
7647 }
7648
7649 pFpuRes->FSW = fFsw;
7650}
7651#endif /* IEM_WITHOUT_ASSEMBLY */
7652
7653IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7654 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7655{
7656 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7657}
7658
7659IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7660 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7661{
7662 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7663}
7664
7665#if defined(IEM_WITHOUT_ASSEMBLY)
7666
7667static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7668{
7669 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7670 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7671 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7672 extFloat80_t v;
7673 (void)fFcw;
7674
7675 v = extF80_ylog2xp1(y, x, &SoftState);
7676 iemFpuSoftF80ToIprt(pr80Result, v);
7677
7678 return fFsw;
7679}
7680
7681IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7682 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7683{
7684 uint16_t const fFcw = pFpuState->FCW;
7685 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7686
7687 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7688 {
7689 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7690
7691 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7692 if (!(fFcw & X86_FCW_PM))
7693 fFsw |= X86_FSW_ES | X86_FSW_B;
7694 }
7695 else
7696 {
7697 fFsw |= X86_FSW_IE;
7698
7699 if (!(fFcw & X86_FCW_IM))
7700 {
7701 pFpuRes->r80Result = *pr80Val2;
7702 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7703 }
7704 else
7705 {
7706 pFpuRes->r80Result = g_r80Indefinite;
7707 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7708 }
7709 }
7710
7711 pFpuRes->FSW = fFsw;
7712}
7713
7714#endif /* IEM_WITHOUT_ASSEMBLY */
7715
7716IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7717 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7718{
7719 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7720}
7721
7722IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7723 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7724{
7725 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7726}
7727
7728
7729/*********************************************************************************************************************************
7730* MMX, SSE & AVX *
7731*********************************************************************************************************************************/
7732
7733#ifdef IEM_WITH_VEX
7734
7735/*
7736 * VMOVSLDUP
7737 */
7738IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7739{
7740 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7741 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7742 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7743 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7744 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7745 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7746 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7747 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7748}
7749
7750
7751IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7752{
7753 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7754 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7755 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7756 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7757 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7758 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7759 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7760 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7761}
7762
7763#endif /* IEM_WITH_VEX */
7764
7765
7766#ifdef IEM_WITH_VEX
7767
7768/*
7769 * VMOVSHDUP
7770 */
7771IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7772{
7773 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7774 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7775 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7776 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7777 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7778 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7779 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7780 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7781}
7782
7783
7784IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7785{
7786 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7787 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7788 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7789 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7790 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7791 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7792 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7793 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7794}
7795
7796#endif /* IEM_WITH_VEX */
7797
7798
7799#ifdef IEM_WITH_VEX
7800
7801/*
7802 * VMOVDDUP
7803 */
7804IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7805{
7806 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7807 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7808 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7809 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7810}
7811
7812IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7813{
7814 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7815 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7816 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7817 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7818}
7819
7820#endif /* IEM_WITH_VEX */
7821
7822
7823/*
 * PAND / VPAND / ANDPS / VANDPS / ANDPD / VANDPD
7825 */
7826#ifdef IEM_WITHOUT_ASSEMBLY
7827
7828IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7829{
7830 RT_NOREF(pFpuState);
7831 *puDst &= *puSrc;
7832}
7833
7834
7835IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7836{
7837 RT_NOREF(pFpuState);
7838 puDst->au64[0] &= puSrc->au64[0];
7839 puDst->au64[1] &= puSrc->au64[1];
7840}
7841
7842#endif
7843
7844IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7845 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7846{
7847 RT_NOREF(pExtState);
7848 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7849 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7850}
7851
7852
7853IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7854 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7855{
7856 RT_NOREF(pExtState);
7857 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7858 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7859 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7860 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7861}
7862
7863
7864/*
 * PANDN / VPANDN / ANDNPS / VANDNPS / ANDNPD / VANDNPD
7866 */
7867#ifdef IEM_WITHOUT_ASSEMBLY
7868
7869IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7870{
7871 RT_NOREF(pFpuState);
7872 *puDst = ~*puDst & *puSrc;
7873}
7874
7875
7876IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7877{
7878 RT_NOREF(pFpuState);
7879 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7880 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7881}
7882
7883#endif
7884
7885IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7886 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7887{
7888 RT_NOREF(pExtState);
7889 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7890 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7891}
7892
7893
7894IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7895 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7896{
7897 RT_NOREF(pExtState);
7898 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7899 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7900 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7901 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7902}
7903
7904
7905/*
7906 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7907 */
7908#ifdef IEM_WITHOUT_ASSEMBLY
7909
7910IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7911{
7912 RT_NOREF(pFpuState);
7913 *puDst |= *puSrc;
7914}
7915
7916
7917IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7918{
7919 RT_NOREF(pFpuState);
7920 puDst->au64[0] |= puSrc->au64[0];
7921 puDst->au64[1] |= puSrc->au64[1];
7922}
7923
7924#endif
7925
7926IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7927 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7928{
7929 RT_NOREF(pExtState);
7930 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7931 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7932}
7933
7934
7935IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7936 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7937{
7938 RT_NOREF(pExtState);
7939 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7940 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7941 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7942 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7943}
7944
7945
7946/*
7947 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7948 */
7949#ifdef IEM_WITHOUT_ASSEMBLY
7950
7951IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7952{
7953 RT_NOREF(pFpuState);
7954 *puDst ^= *puSrc;
7955}
7956
7957
7958IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7959{
7960 RT_NOREF(pFpuState);
7961 puDst->au64[0] ^= puSrc->au64[0];
7962 puDst->au64[1] ^= puSrc->au64[1];
7963}
7964
7965#endif
7966
7967IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7968 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7969{
7970 RT_NOREF(pExtState);
7971 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7972 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7973}
7974
7975
7976IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7977 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7978{
7979 RT_NOREF(pExtState);
7980 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7981 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7982 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7983 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7984}
7985
7986
7987/*
7988 * PCMPEQB / VPCMPEQB
7989 */
7990#ifdef IEM_WITHOUT_ASSEMBLY
7991
7992IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7993{
7994 RT_NOREF(pFpuState);
7995 RTUINT64U uSrc1 = { *puDst };
7996 RTUINT64U uSrc2 = { *puSrc };
7997 RTUINT64U uDst;
7998 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7999 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
8000 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
8001 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
8002 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
8003 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
8004 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
8005 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
8006 *puDst = uDst.u;
8007}
8008
8009
8010IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8011{
8012 RT_NOREF(pFpuState);
8013 RTUINT128U uSrc1 = *puDst;
8014 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
8015 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
8016 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
8017 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
8018 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
8019 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
8020 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
8021 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
8022 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
8023 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
8024 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
8025 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
8026 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
8027 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
8028 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
8029 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
8030}
8031
8032#endif
8033
8034IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8035 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8036{
8037 RT_NOREF(pExtState);
8038 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8039 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8040 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8041 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8042 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8043 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8044 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8045 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8046 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8047 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8048 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8049 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8050 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8051 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8052 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8053 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8054}
8055
8056IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8057 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8058{
8059 RT_NOREF(pExtState);
8060 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8061 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8062 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8063 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8064 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8065 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8066 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8067 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8068 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8069 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8070 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8071 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8072 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8073 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8074 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8075 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8076 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
8077 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
8078 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
8079 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
8080 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
8081 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
8082 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
8083 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
8084 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
8085 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
8086 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
8087 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
8088 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
8089 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
8090 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
8091 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
8092}
8093
8094
8095/*
8096 * PCMPEQW / VPCMPEQW
8097 */
8098#ifdef IEM_WITHOUT_ASSEMBLY
8099
8100IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8101{
8102 RT_NOREF(pFpuState);
8103 RTUINT64U uSrc1 = { *puDst };
8104 RTUINT64U uSrc2 = { *puSrc };
8105 RTUINT64U uDst;
8106 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
8107 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
8108 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
8109 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
8110 *puDst = uDst.u;
8111}
8112
8113
8114IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8115{
8116 RT_NOREF(pFpuState);
8117 RTUINT128U uSrc1 = *puDst;
8118 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
8119 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
8120 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
8121 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
8122 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
8123 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
8124 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
8125 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
8126}
8127
8128#endif
8129
8130IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8131 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8132{
8133 RT_NOREF(pExtState);
8134 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8135 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8136 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8137 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8138 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8139 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8140 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8141 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8142}
8143
8144IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8145 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8146{
8147 RT_NOREF(pExtState);
8148 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8149 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8150 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8151 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8152 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8153 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8154 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8155 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8156 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8157 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8158 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8159 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8160 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8161 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8162 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8163 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8164}
8165
8166
8167/*
8168 * PCMPEQD / VPCMPEQD.
8169 */
8170#ifdef IEM_WITHOUT_ASSEMBLY
8171
8172IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8173{
8174 RT_NOREF(pFpuState);
8175 RTUINT64U uSrc1 = { *puDst };
8176 RTUINT64U uSrc2 = { *puSrc };
8177 RTUINT64U uDst;
8178 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8179 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8180 *puDst = uDst.u;
8181}
8182
8183
8184IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8185{
8186 RT_NOREF(pFpuState);
8187 RTUINT128U uSrc1 = *puDst;
8188 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8189 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8190 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8191 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8192}
8193
8194#endif /* IEM_WITHOUT_ASSEMBLY */
8195
8196IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8197 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8198{
8199 RT_NOREF(pExtState);
8200 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8201 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8202 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8203 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8204}
8205
8206IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8207 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8208{
8209 RT_NOREF(pExtState);
8210 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8211 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8212 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8213 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8214 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8215 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8216 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8217 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8218}
8219
8220
8221/*
8222 * PCMPEQQ / VPCMPEQQ.
8223 */
8224IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8225{
8226 RT_NOREF(pFpuState);
8227 RTUINT128U uSrc1 = *puDst;
8228 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8229 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8230}
8231
8232IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8233 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8234{
8235 RT_NOREF(pExtState);
8236 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8237 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8238}
8239
8240IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8241 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8242{
8243 RT_NOREF(pExtState);
8244 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8245 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8246 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8247 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8248}
8249
8250
8251/*
8252 * PCMPGTB / VPCMPGTB
8253 */
8254#ifdef IEM_WITHOUT_ASSEMBLY
8255
8256IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8257{
8258 RT_NOREF(pFpuState);
8259 RTUINT64U uSrc1 = { *puDst };
8260 RTUINT64U uSrc2 = { *puSrc };
8261 RTUINT64U uDst;
8262 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8263 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8264 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8265 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8266 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8267 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8268 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8269 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8270 *puDst = uDst.u;
8271}
8272
8273
8274IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8275{
8276 RT_NOREF(pFpuState);
8277 RTUINT128U uSrc1 = *puDst;
8278 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8279 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8280 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8281 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8282 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8283 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8284 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8285 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8286 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8287 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8288 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8289 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8290 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8291 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8292 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8293 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8294}
8295
8296#endif
8297
8298IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8299 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8300{
8301 RT_NOREF(pExtState);
8302 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8303 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8304 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8305 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8306 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8307 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8308 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8309 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8310 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8311 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8312 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8313 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8314 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8315 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8316 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8317 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8318}
8319
8320IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8321 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8322{
8323 RT_NOREF(pExtState);
8324 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8325 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8326 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8327 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8328 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8329 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8330 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8331 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8332 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8333 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8334 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8335 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8336 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8337 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8338 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8339 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8340 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8341 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8342 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8343 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8344 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8345 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8346 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8347 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8348 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8349 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8350 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8351 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8352 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8353 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8354 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8355 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8356}
8357
8358
8359/*
8360 * PCMPGTW / VPCMPGTW
8361 */
8362#ifdef IEM_WITHOUT_ASSEMBLY
8363
8364IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8365{
8366 RT_NOREF(pFpuState);
8367 RTUINT64U uSrc1 = { *puDst };
8368 RTUINT64U uSrc2 = { *puSrc };
8369 RTUINT64U uDst;
8370 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8371 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8372 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8373 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8374 *puDst = uDst.u;
8375}
8376
8377
8378IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8379{
8380 RT_NOREF(pFpuState);
8381 RTUINT128U uSrc1 = *puDst;
8382 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8383 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8384 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8385 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8386 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8387 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8388 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8389 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8390}
8391
8392#endif
8393
8394IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8395 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8396{
8397 RT_NOREF(pExtState);
8398 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8399 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8400 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8401 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8402 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8403 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8404 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8405 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8406}
8407
8408IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8409 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8410{
8411 RT_NOREF(pExtState);
8412 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8413 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8414 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8415 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8416 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8417 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8418 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8419 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8420 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8421 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8422 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8423 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8424 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8425 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8426 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8427 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8428}
8429
8430
8431/*
8432 * PCMPGTD / VPCMPGTD.
8433 */
8434#ifdef IEM_WITHOUT_ASSEMBLY
8435
8436IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8437{
8438 RT_NOREF(pFpuState);
8439 RTUINT64U uSrc1 = { *puDst };
8440 RTUINT64U uSrc2 = { *puSrc };
8441 RTUINT64U uDst;
8442 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8443 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8444 *puDst = uDst.u;
8445}
8446
8447
8448IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8449{
8450 RT_NOREF(pFpuState);
8451 RTUINT128U uSrc1 = *puDst;
8452 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8453 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8454 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8455 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8456}
8457
8458#endif /* IEM_WITHOUT_ASSEMBLY */
8459
8460IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8461 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8462{
8463 RT_NOREF(pExtState);
8464 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8465 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8466 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8467 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8468}
8469
8470IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8471 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8472{
8473 RT_NOREF(pExtState);
8474 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8475 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8476 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8477 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8478 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8479 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8480 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8481 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8482}
8483
8484
8485/*
8486 * PCMPGTQ / VPCMPGTQ.
8487 */
8488IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8489{
8490 RT_NOREF(pFpuState);
8491 RTUINT128U uSrc1 = *puDst;
8492 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8493 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8494}
8495
8496IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8497 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8498{
8499 RT_NOREF(pExtState);
8500 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8501 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8502}
8503
8504IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8505 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8506{
8507 RT_NOREF(pExtState);
8508 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8509 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8510 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8511 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8512}
8513
8514
8515/*
8516 * PADDB / VPADDB
8517 */
8518#ifdef IEM_WITHOUT_ASSEMBLY
8519
8520IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8521{
8522 RT_NOREF(pFpuState);
8523 RTUINT64U uSrc1 = { *puDst };
8524 RTUINT64U uSrc2 = { *puSrc };
8525 RTUINT64U uDst;
8526 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8527 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8528 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8529 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8530 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8531 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8532 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8533 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8534 *puDst = uDst.u;
8535}
8536
8537
8538IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8539{
8540 RT_NOREF(pFpuState);
8541 RTUINT128U uSrc1 = *puDst;
8542 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8543 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8544 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8545 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8546 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8547 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8548 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8549 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8550 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8551 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8552 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8553 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8554 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8555 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8556 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8557 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8558}
8559
8560#endif
8561
8562
8563IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8564 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8565{
8566 RT_NOREF(pExtState);
8567 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8568 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8569 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8570 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8571 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8572 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8573 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8574 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8575 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8576 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8577 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8578 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8579 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8580 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8581 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8582 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8583}
8584
8585IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8586 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8587{
8588 RT_NOREF(pExtState);
8589 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8590 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8591 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8592 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8593 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8594 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8595 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8596 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8597 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8598 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8599 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8600 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8601 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8602 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8603 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8604 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8605 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8606 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8607 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8608 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8609 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8610 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8611 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8612 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8613 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8614 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8615 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8616 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8617 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8618 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8619 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8620 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8621}
8622
8623
8624/*
8625 * PADDSB / VPADDSB
8626 */
/**
 * Narrows a signed word value to a signed byte with saturation.
 *
 * The input is biased by 0x80 and viewed as an unsigned 16-bit number; when
 * that lands in 0..0xff the input fits into a signed byte and its low byte is
 * the result.  Otherwise the result is 0x7f (INT8_MAX) plus bit 15 (the sign
 * bit) of the input, i.e. 0x7f for positive overflow and 0x80 (INT8_MIN) for
 * negative overflow.
 *
 * @note    The argument is expanded more than once, so it must be free of
 *          side effects.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) \
      : (uint8_t)(a_iWord) )
8631
8632#ifdef IEM_WITHOUT_ASSEMBLY
8633
8634IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8635{
8636 RT_NOREF(pFpuState);
8637 RTUINT64U uSrc1 = { *puDst };
8638 RTUINT64U uSrc2 = { *puSrc };
8639 RTUINT64U uDst;
8640 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8641 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8642 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8643 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8644 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8645 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8646 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8647 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8648 *puDst = uDst.u;
8649}
8650
8651
8652IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8653{
8654 RT_NOREF(pFpuState);
8655 RTUINT128U uSrc1 = *puDst;
8656 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8657 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8658 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8659 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8660 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8661 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8662 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8663 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8664 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8665 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8666 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8667 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8668 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8669 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8670 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8671 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8672}
8673
8674#endif
8675
8676IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u128_fallback,(PRTUINT128U puDst,
8677 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8678{
8679 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8680 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8681 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8682 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8683 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8684 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8685 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8686 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8687 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8688 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8689 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8690 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8691 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8692 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8693 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8694 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8695}
8696
8697IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u256_fallback,(PRTUINT256U puDst,
8698 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8699{
8700 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8701 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8702 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8703 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8704 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8705 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8706 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8707 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8708 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8709 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8710 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8711 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8712 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8713 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8714 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8715 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8716 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] + puSrc2->ai8[16]);
8717 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] + puSrc2->ai8[17]);
8718 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] + puSrc2->ai8[18]);
8719 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] + puSrc2->ai8[19]);
8720 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] + puSrc2->ai8[20]);
8721 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] + puSrc2->ai8[21]);
8722 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] + puSrc2->ai8[22]);
8723 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] + puSrc2->ai8[23]);
8724 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] + puSrc2->ai8[24]);
8725 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] + puSrc2->ai8[25]);
8726 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] + puSrc2->ai8[26]);
8727 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] + puSrc2->ai8[27]);
8728 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] + puSrc2->ai8[28]);
8729 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] + puSrc2->ai8[29]);
8730 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] + puSrc2->ai8[30]);
8731 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] + puSrc2->ai8[31]);
8732}
8733
8734
8735/*
8736 * PADDUSB / VPADDUSB
8737 */
/** Narrows a 16-bit unsigned value to 8 bits, clamping anything above 0xff to 0xff.
 * @note The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) ) /* 0xff = UINT8_MAX */
8742
8743#ifdef IEM_WITHOUT_ASSEMBLY
8744
8745IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8746{
8747 RT_NOREF(pFpuState);
8748 RTUINT64U uSrc1 = { *puDst };
8749 RTUINT64U uSrc2 = { *puSrc };
8750 RTUINT64U uDst;
8751 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8752 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8753 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8754 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8755 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8756 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8757 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8758 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8759 *puDst = uDst.u;
8760}
8761
8762
8763IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8764{
8765 RT_NOREF(pFpuState);
8766 RTUINT128U uSrc1 = *puDst;
8767 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8768 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8769 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8770 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8771 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8772 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8773 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8774 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8775 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8776 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8777 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8778 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8779 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8780 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8781 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8782 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8783}
8784
8785#endif
8786
8787IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u128_fallback,(PRTUINT128U puDst,
8788 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8789{
8790 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8791 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8792 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8793 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8794 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8795 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8796 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8797 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8798 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8799 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8800 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8801 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8802 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8803 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8804 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8805 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8806}
8807
8808IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u256_fallback,(PRTUINT256U puDst,
8809 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8810{
8811 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8812 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8813 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8814 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8815 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8816 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8817 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8818 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8819 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8820 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8821 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8822 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8823 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8824 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8825 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8826 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8827 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[16] + puSrc2->au8[16]);
8828 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[17] + puSrc2->au8[17]);
8829 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[18] + puSrc2->au8[18]);
8830 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[19] + puSrc2->au8[19]);
8831 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[20] + puSrc2->au8[20]);
8832 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[21] + puSrc2->au8[21]);
8833 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[22] + puSrc2->au8[22]);
8834 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[23] + puSrc2->au8[23]);
8835 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[24] + puSrc2->au8[24]);
8836 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[25] + puSrc2->au8[25]);
8837 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[26] + puSrc2->au8[26]);
8838 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[27] + puSrc2->au8[27]);
8839 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[28] + puSrc2->au8[28]);
8840 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[29] + puSrc2->au8[29]);
8841 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[30] + puSrc2->au8[30]);
8842 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[31] + puSrc2->au8[31]);
8843}
8844
8845
8846/*
8847 * PADDW / VPADDW
8848 */
8849#ifdef IEM_WITHOUT_ASSEMBLY
8850
8851IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8852{
8853 RT_NOREF(pFpuState);
8854 RTUINT64U uSrc1 = { *puDst };
8855 RTUINT64U uSrc2 = { *puSrc };
8856 RTUINT64U uDst;
8857 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8858 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8859 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8860 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8861 *puDst = uDst.u;
8862}
8863
8864
8865IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8866{
8867 RT_NOREF(pFpuState);
8868 RTUINT128U uSrc1 = *puDst;
8869 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8870 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8871 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8872 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8873 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8874 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8875 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8876 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8877}
8878
8879#endif
8880
8881
8882IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8883 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8884{
8885 RT_NOREF(pExtState);
8886 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8887 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8888 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8889 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8890 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8891 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8892 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8893 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8894}
8895
8896IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8897 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8898{
8899 RT_NOREF(pExtState);
8900 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8901 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8902 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8903 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8904 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8905 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8906 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8907 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8908 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8909 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8910 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8911 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8912 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8913 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8914 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8915 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8916}
8917
8918
8919/*
8920 * PADDSW / VPADDSW
8921 */
/** Narrows a signed 32-bit value to 16 bits: values that fit are truncated as-is,
 * anything else yields 0x7fff plus the source sign bit (0x7fff or 0x8000).
 * @note The argument is evaluated more than once. */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) ) /* 0x7fff = INT16_MAX; 0x8000 = INT16_MIN; source bit 31 = sign */
8926
8927#ifdef IEM_WITHOUT_ASSEMBLY
8928
8929IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8930{
8931 RT_NOREF(pFpuState);
8932 RTUINT64U uSrc1 = { *puDst };
8933 RTUINT64U uSrc2 = { *puSrc };
8934 RTUINT64U uDst;
8935 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8936 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8937 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8938 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8939 *puDst = uDst.u;
8940}
8941
8942
8943IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8944{
8945 RT_NOREF(pFpuState);
8946 RTUINT128U uSrc1 = *puDst;
8947 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8948 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8949 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8950 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8951 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8952 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8953 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8954 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8955}
8956
8957#endif
8958
8959IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u128_fallback,(PRTUINT128U puDst,
8960 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8961{
8962 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8963 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8964 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8965 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8966 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8967 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8968 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8969 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8970}
8971
8972IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u256_fallback,(PRTUINT256U puDst,
8973 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8974{
8975 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8976 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8977 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8978 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8979 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8980 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8981 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8982 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8983 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] + puSrc2->ai16[8]);
8984 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] + puSrc2->ai16[9]);
8985 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc2->ai16[10]);
8986 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] + puSrc2->ai16[11]);
8987 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc2->ai16[12]);
8988 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] + puSrc2->ai16[13]);
8989 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc2->ai16[14]);
8990 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] + puSrc2->ai16[15]);
8991}
8992
8993
8994/*
8995 * PADDUSW / VPADDUSW
8996 */
/** Narrows a 32-bit unsigned value to 16 bits, clamping anything above 0xffff to 0xffff.
 * @note The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) ) /* 0xffff = UINT16_MAX */
9001
9002#ifdef IEM_WITHOUT_ASSEMBLY
9003
9004IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9005{
9006 RT_NOREF(pFpuState);
9007 RTUINT64U uSrc1 = { *puDst };
9008 RTUINT64U uSrc2 = { *puSrc };
9009 RTUINT64U uDst;
9010 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
9011 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
9012 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
9013 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
9014 *puDst = uDst.u;
9015}
9016
9017
9018IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9019{
9020 RT_NOREF(pFpuState);
9021 RTUINT128U uSrc1 = *puDst;
9022 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
9023 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
9024 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
9025 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
9026 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
9027 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
9028 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
9029 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
9030}
9031
9032#endif
9033
9034IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u128_fallback,(PRTUINT128U puDst,
9035 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9036{
9037 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
9038 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
9039 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
9040 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
9041 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
9042 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
9043 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
9044 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
9045}
9046
9047IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u256_fallback,(PRTUINT256U puDst,
9048 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9049{
9050 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
9051 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
9052 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
9053 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
9054 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
9055 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
9056 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
9057 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
9058 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[8] + puSrc2->au16[8]);
9059 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[9] + puSrc2->au16[9]);
9060 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[10] + puSrc2->au16[10]);
9061 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[11] + puSrc2->au16[11]);
9062 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[12] + puSrc2->au16[12]);
9063 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[13] + puSrc2->au16[13]);
9064 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[14] + puSrc2->au16[14]);
9065 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[15] + puSrc2->au16[15]);
9066}
9067
9068
9069/*
9070 * PADDD / VPADDD.
9071 */
9072#ifdef IEM_WITHOUT_ASSEMBLY
9073
9074IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9075{
9076 RT_NOREF(pFpuState);
9077 RTUINT64U uSrc1 = { *puDst };
9078 RTUINT64U uSrc2 = { *puSrc };
9079 RTUINT64U uDst;
9080 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
9081 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
9082 *puDst = uDst.u;
9083}
9084
9085
9086IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9087{
9088 RT_NOREF(pFpuState);
9089 RTUINT128U uSrc1 = *puDst;
9090 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
9091 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
9092 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
9093 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
9094}
9095
9096#endif /* IEM_WITHOUT_ASSEMBLY */
9097
9098IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9099 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9100{
9101 RT_NOREF(pExtState);
9102 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9103 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9104 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9105 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9106}
9107
9108IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9109 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9110{
9111 RT_NOREF(pExtState);
9112 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9113 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9114 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9115 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9116 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
9117 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
9118 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
9119 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
9120}
9121
9122
9123/*
9124 * PADDQ / VPADDQ.
9125 */
9126#ifdef IEM_WITHOUT_ASSEMBLY
9127
9128IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9129{
9130 RT_NOREF(pFpuState);
9131 *puDst = *puDst + *puSrc;
9132}
9133
9134IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9135{
9136 RT_NOREF(pFpuState);
9137 RTUINT128U uSrc1 = *puDst;
9138 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
9139 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
9140}
9141
9142#endif
9143
9144IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9145 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9146{
9147 RT_NOREF(pExtState);
9148 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9149 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9150}
9151
9152IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9153 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9154{
9155 RT_NOREF(pExtState);
9156 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9157 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9158 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
9159 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
9160}
9161
9162
9163/*
9164 * PSUBB / VPSUBB
9165 */
9166#ifdef IEM_WITHOUT_ASSEMBLY
9167
9168IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9169{
9170 RT_NOREF(pFpuState);
9171 RTUINT64U uSrc1 = { *puDst };
9172 RTUINT64U uSrc2 = { *puSrc };
9173 RTUINT64U uDst;
9174 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
9175 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
9176 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
9177 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
9178 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
9179 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
9180 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
9181 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
9182 *puDst = uDst.u;
9183}
9184
9185
9186IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9187{
9188 RT_NOREF(pFpuState);
9189 RTUINT128U uSrc1 = *puDst;
9190 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
9191 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
9192 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
9193 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
9194 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
9195 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
9196 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
9197 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
9198 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
9199 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
9200 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
9201 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
9202 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
9203 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
9204 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
9205 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
9206}
9207
9208#endif
9209
9210IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9211 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9212{
9213 RT_NOREF(pExtState);
9214 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9215 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9216 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9217 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9218 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9219 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9220 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9221 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9222 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9223 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9224 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9225 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9226 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9227 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9228 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9229 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9230}
9231
9232IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9233 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9234{
9235 RT_NOREF(pExtState);
9236 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9237 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9238 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9239 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9240 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9241 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9242 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9243 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9244 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9245 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9246 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9247 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9248 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9249 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9250 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9251 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9252 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
9253 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
9254 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
9255 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
9256 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
9257 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
9258 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
9259 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
9260 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
9261 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
9262 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
9263 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
9264 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
9265 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
9266 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
9267 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
9268}
9269
9270
/*
 * PSUBSB / VPSUBSB
 */
9274#ifdef IEM_WITHOUT_ASSEMBLY
9275
9276IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9277{
9278 RT_NOREF(pFpuState);
9279 RTUINT64U uSrc1 = { *puDst };
9280 RTUINT64U uSrc2 = { *puSrc };
9281 RTUINT64U uDst;
9282 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
9283 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
9284 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
9285 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
9286 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
9287 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
9288 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
9289 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
9290 *puDst = uDst.u;
9291}
9292
9293
9294IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9295{
9296 RT_NOREF(pFpuState);
9297 RTUINT128U uSrc1 = *puDst;
9298 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
9299 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
9300 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
9301 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
9302 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
9303 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
9304 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
9305 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
9306 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
9307 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
9308 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
9309 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
9310 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
9311 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
9312 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
9313 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
9314}
9315
9316#endif
9317
9318IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u128_fallback,(PRTUINT128U puDst,
9319 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9320{
9321 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9322 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9323 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9324 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9325 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9326 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9327 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9328 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9329 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9330 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9331 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9332 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9333 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9334 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9335 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9336 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9337}
9338
9339IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u256_fallback,(PRTUINT256U puDst,
9340 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9341{
9342 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9343 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9344 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9345 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9346 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9347 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9348 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9349 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9350 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9351 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9352 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9353 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9354 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9355 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9356 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9357 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9358 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] - puSrc2->ai8[16]);
9359 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] - puSrc2->ai8[17]);
9360 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] - puSrc2->ai8[18]);
9361 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] - puSrc2->ai8[19]);
9362 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] - puSrc2->ai8[20]);
9363 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] - puSrc2->ai8[21]);
9364 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] - puSrc2->ai8[22]);
9365 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] - puSrc2->ai8[23]);
9366 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] - puSrc2->ai8[24]);
9367 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] - puSrc2->ai8[25]);
9368 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] - puSrc2->ai8[26]);
9369 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] - puSrc2->ai8[27]);
9370 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] - puSrc2->ai8[28]);
9371 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] - puSrc2->ai8[29]);
9372 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] - puSrc2->ai8[30]);
9373 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] - puSrc2->ai8[31]);
9374}
9375
9376
/*
 * PSUBUSB / VPSUBUSB
 */
/** Clamps the result of an unsigned byte subtraction: anything that does not
 *  fit in 0..0xff once viewed as a 16-bit unsigned value (i.e. a borrow
 *  occurred) becomes zero.  Note that a_uWord is evaluated twice. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
9384
9385#ifdef IEM_WITHOUT_ASSEMBLY
9386
9387IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9388{
9389 RT_NOREF(pFpuState);
9390 RTUINT64U uSrc1 = { *puDst };
9391 RTUINT64U uSrc2 = { *puSrc };
9392 RTUINT64U uDst;
9393 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9394 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9395 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9396 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9397 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9398 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9399 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9400 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9401 *puDst = uDst.u;
9402}
9403
9404
9405IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9406{
9407 RT_NOREF(pFpuState);
9408 RTUINT128U uSrc1 = *puDst;
9409 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9410 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9411 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9412 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9413 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9414 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9415 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9416 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9417 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9418 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9419 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9420 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9421 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9422 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9423 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9424 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9425}
9426
9427#endif
9428
9429IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u128_fallback,(PRTUINT128U puDst,
9430 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9431{
9432 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9433 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9434 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9435 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9436 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9437 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9438 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9439 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9440 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9441 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9442 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9443 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9444 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9445 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9446 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9447 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9448}
9449
9450IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u256_fallback,(PRTUINT256U puDst,
9451 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9452{
9453 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9454 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9455 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9456 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9457 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9458 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9459 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9460 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9461 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9462 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9463 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9464 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9465 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9466 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9467 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9468 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9469 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[16] - puSrc2->au8[16]);
9470 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[17] - puSrc2->au8[17]);
9471 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[18] - puSrc2->au8[18]);
9472 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[19] - puSrc2->au8[19]);
9473 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[20] - puSrc2->au8[20]);
9474 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[21] - puSrc2->au8[21]);
9475 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[22] - puSrc2->au8[22]);
9476 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[23] - puSrc2->au8[23]);
9477 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[24] - puSrc2->au8[24]);
9478 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[25] - puSrc2->au8[25]);
9479 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[26] - puSrc2->au8[26]);
9480 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[27] - puSrc2->au8[27]);
9481 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[28] - puSrc2->au8[28]);
9482 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[29] - puSrc2->au8[29]);
9483 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[30] - puSrc2->au8[30]);
9484 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[31] - puSrc2->au8[31]);
9485}
9486
9487
9488/*
9489 * PSUBW / VPSUBW
9490 */
9491#ifdef IEM_WITHOUT_ASSEMBLY
9492
9493IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9494{
9495 RT_NOREF(pFpuState);
9496 RTUINT64U uSrc1 = { *puDst };
9497 RTUINT64U uSrc2 = { *puSrc };
9498 RTUINT64U uDst;
9499 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9500 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9501 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9502 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9503 *puDst = uDst.u;
9504}
9505
9506
9507IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9508{
9509 RT_NOREF(pFpuState);
9510 RTUINT128U uSrc1 = *puDst;
9511 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9512 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9513 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9514 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9515 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9516 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9517 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9518 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9519}
9520
9521#endif
9522
9523IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9524 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9525{
9526 RT_NOREF(pExtState);
9527 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9528 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9529 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9530 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9531 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9532 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9533 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9534 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9535}
9536
9537IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9538 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9539{
9540 RT_NOREF(pExtState);
9541 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9542 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9543 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9544 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9545 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9546 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9547 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9548 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9549 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9550 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9551 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9552 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9553 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9554 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9555 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9556 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9557}
9558
9559
9560/*
9561 * PSUBSW / VPSUBSW
9562 */
9563#ifdef IEM_WITHOUT_ASSEMBLY
9564
9565IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9566{
9567 RT_NOREF(pFpuState);
9568 RTUINT64U uSrc1 = { *puDst };
9569 RTUINT64U uSrc2 = { *puSrc };
9570 RTUINT64U uDst;
9571 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9572 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9573 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9574 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9575 *puDst = uDst.u;
9576}
9577
9578
9579IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9580{
9581 RT_NOREF(pFpuState);
9582 RTUINT128U uSrc1 = *puDst;
9583 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9584 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9585 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9586 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9587 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9588 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9589 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9590 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9591}
9592
9593#endif
9594
9595IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u128_fallback,(PRTUINT128U puDst,
9596 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9597{
9598 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9599 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9600 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9601 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9602 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9603 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9604 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9605 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9606}
9607
9608IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u256_fallback,(PRTUINT256U puDst,
9609 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9610{
9611 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9612 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9613 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9614 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9615 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9616 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9617 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9618 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9619 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] - puSrc2->ai16[8]);
9620 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] - puSrc2->ai16[9]);
9621 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc2->ai16[10]);
9622 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] - puSrc2->ai16[11]);
9623 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc2->ai16[12]);
9624 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] - puSrc2->ai16[13]);
9625 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc2->ai16[14]);
9626 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] - puSrc2->ai16[15]);
9627}
9628
9629
9630/*
9631 * PSUBUSW / VPSUBUSW
9632 */
/** Clamps the result of an unsigned word subtraction: anything that does not
 *  fit in 0..0xffff once viewed as a 32-bit unsigned value (i.e. a borrow
 *  occurred) becomes zero.  Note that a_uDword is evaluated twice. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9637
9638#ifdef IEM_WITHOUT_ASSEMBLY
9639
9640IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9641{
9642 RT_NOREF(pFpuState);
9643 RTUINT64U uSrc1 = { *puDst };
9644 RTUINT64U uSrc2 = { *puSrc };
9645 RTUINT64U uDst;
9646 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9647 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9648 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9649 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9650 *puDst = uDst.u;
9651}
9652
9653
9654IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9655{
9656 RT_NOREF(pFpuState);
9657 RTUINT128U uSrc1 = *puDst;
9658 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9659 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9660 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9661 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9662 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9663 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9664 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9665 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9666}
9667
9668#endif
9669
9670IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u128_fallback,(PRTUINT128U puDst,
9671 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9672{
9673 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9674 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9675 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9676 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9677 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9678 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9679 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9680 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9681}
9682
9683IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u256_fallback,(PRTUINT256U puDst,
9684 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9685{
9686 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9687 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9688 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9689 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9690 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9691 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9692 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9693 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9694 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[8] - puSrc2->au16[8]);
9695 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[9] - puSrc2->au16[9]);
9696 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[10] - puSrc2->au16[10]);
9697 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[11] - puSrc2->au16[11]);
9698 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[12] - puSrc2->au16[12]);
9699 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[13] - puSrc2->au16[13]);
9700 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[14] - puSrc2->au16[14]);
9701 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[15] - puSrc2->au16[15]);
9702}
9703
9704
9705
9706/*
9707 * PSUBD / VPSUBD.
9708 */
9709#ifdef IEM_WITHOUT_ASSEMBLY
9710
9711IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9712{
9713 RT_NOREF(pFpuState);
9714 RTUINT64U uSrc1 = { *puDst };
9715 RTUINT64U uSrc2 = { *puSrc };
9716 RTUINT64U uDst;
9717 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9718 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9719 *puDst = uDst.u;
9720}
9721
9722
9723IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9724{
9725 RT_NOREF(pFpuState);
9726 RTUINT128U uSrc1 = *puDst;
9727 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9728 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9729 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9730 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9731}
9732
9733#endif /* IEM_WITHOUT_ASSEMBLY */
9734
9735IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9736 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9737{
9738 RT_NOREF(pExtState);
9739 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9740 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9741 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9742 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9743}
9744
9745IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9746 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9747{
9748 RT_NOREF(pExtState);
9749 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9750 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9751 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9752 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9753 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9754 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9755 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9756 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9757}
9758
9759
9760/*
9761 * PSUBQ / VPSUBQ.
9762 */
9763#ifdef IEM_WITHOUT_ASSEMBLY
9764
9765IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9766{
9767 RT_NOREF(pFpuState);
9768 *puDst = *puDst - *puSrc;
9769}
9770
9771IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9772{
9773 RT_NOREF(pFpuState);
9774 RTUINT128U uSrc1 = *puDst;
9775 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9776 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9777}
9778
9779#endif
9780
9781IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9782 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9783{
9784 RT_NOREF(pExtState);
9785 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9786 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9787}
9788
9789IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9790 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9791{
9792 RT_NOREF(pExtState);
9793 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9794 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9795 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9796 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9797}
9798
9799
9800
9801/*
9802 * PMULLW / VPMULLW / PMULLD / VPMULLD
9803 */
9804#ifdef IEM_WITHOUT_ASSEMBLY
9805
9806IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9807{
9808 RT_NOREF(pFpuState);
9809 RTUINT64U uSrc1 = { *puDst };
9810 RTUINT64U uSrc2 = { *puSrc };
9811 RTUINT64U uDst;
9812 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9813 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9814 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9815 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9816 *puDst = uDst.u;
9817}
9818
9819
9820IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9821{
9822 RT_NOREF(pFpuState);
9823 RTUINT128U uSrc1 = *puDst;
9824 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9825 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9826 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9827 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9828 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9829 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9830 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9831 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9832}
9833
9834#endif
9835
9836IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9837{
9838 RTUINT128U uSrc1 = *puDst;
9839
9840 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9841 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9842 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9843 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9844 RT_NOREF(pFpuState);
9845}
9846
9847
9848IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9849{
9850 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9851 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9852 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9853 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9854 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9855 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9856 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9857 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9858}
9859
9860
9861IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9862{
9863 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9864 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9865 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9866 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9867 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9868 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9869 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9870 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9871 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9872 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9873 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9874 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9875 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9876 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9877 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9878 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9879}
9880
9881
9882IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9883{
9884 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9885 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9886 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9887 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9888}
9889
9890
9891IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9892{
9893 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9894 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9895 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9896 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9897 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9898 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9899 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9900 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9901}
9902
9903
9904/*
9905 * PMULHW / VPMULHW
9906 */
9907#ifdef IEM_WITHOUT_ASSEMBLY
9908
9909IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9910{
9911 RT_NOREF(pFpuState);
9912 RTUINT64U uSrc1 = { *puDst };
9913 RTUINT64U uSrc2 = { *puSrc };
9914 RTUINT64U uDst;
9915 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9916 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9917 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9918 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9919 *puDst = uDst.u;
9920}
9921
9922
9923IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9924{
9925 RT_NOREF(pFpuState);
9926 RTUINT128U uSrc1 = *puDst;
9927 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9928 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9929 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9930 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9931 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9932 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9933 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9934 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9935}
9936
9937#endif
9938
9939IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9940{
9941 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9942 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9943 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9944 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9945 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9946 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9947 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9948 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9949}
9950
9951
9952IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9953{
9954 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9955 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9956 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9957 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9958 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9959 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9960 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9961 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9962 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9963 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9964 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9965 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9966 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9967 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9968 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9969 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9970}
9971
9972
9973/*
9974 * PMULHUW / VPMULHUW
9975 */
9976#ifdef IEM_WITHOUT_ASSEMBLY
9977
9978IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9979{
9980 RTUINT64U uSrc1 = { *puDst };
9981 RTUINT64U uSrc2 = { *puSrc };
9982 RTUINT64U uDst;
9983 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9984 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9985 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9986 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9987 *puDst = uDst.u;
9988}
9989
9990
9991IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9992{
9993 RTUINT128U uSrc1 = *puDst;
9994 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9995 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9996 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9997 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9998 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9999 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
10000 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
10001 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
10002}
10003
10004#endif
10005
10006IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10007{
10008 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
10009 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
10010 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
10011 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
10012 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
10013 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
10014 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
10015 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
10016}
10017
10018
10019IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10020{
10021 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
10022 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
10023 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
10024 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
10025 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
10026 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
10027 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
10028 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
10029 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
10030 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
10031 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
10032 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
10033 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
10034 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
10035 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
10036 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
10037}
10038
10039
10040/*
10041 * PSRLW / VPSRLW
10042 */
10043#ifdef IEM_WITHOUT_ASSEMBLY
10044
10045IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10046{
10047 RTUINT64U uSrc1 = { *puDst };
10048 RTUINT64U uSrc2 = { *puSrc };
10049 RTUINT64U uDst;
10050
10051 if (uSrc2.au64[0] <= 15)
10052 {
10053 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
10054 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
10055 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
10056 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
10057 }
10058 else
10059 {
10060 uDst.au64[0] = 0;
10061 }
10062 *puDst = uDst.u;
10063}
10064
10065
10066IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10067{
10068 RTUINT64U uSrc1 = { *puDst };
10069 RTUINT64U uDst;
10070
10071 if (uShift <= 15)
10072 {
10073 uDst.au16[0] = uSrc1.au16[0] >> uShift;
10074 uDst.au16[1] = uSrc1.au16[1] >> uShift;
10075 uDst.au16[2] = uSrc1.au16[2] >> uShift;
10076 uDst.au16[3] = uSrc1.au16[3] >> uShift;
10077 }
10078 else
10079 {
10080 uDst.au64[0] = 0;
10081 }
10082 *puDst = uDst.u;
10083}
10084
10085
10086IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10087{
10088 RTUINT128U uSrc1 = *puDst;
10089
10090 if (puSrc->au64[0] <= 15)
10091 {
10092 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
10093 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
10094 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
10095 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
10096 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
10097 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
10098 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
10099 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
10100 }
10101 else
10102 {
10103 puDst->au64[0] = 0;
10104 puDst->au64[1] = 0;
10105 }
10106}
10107
10108IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10109{
10110 RTUINT128U uSrc1 = *puDst;
10111
10112 if (uShift <= 15)
10113 {
10114 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10115 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10116 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10117 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10118 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10119 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10120 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10121 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10122 }
10123 else
10124 {
10125 puDst->au64[0] = 0;
10126 puDst->au64[1] = 0;
10127 }
10128}
10129
10130#endif
10131
10132
10133/*
10134 * PSRAW / VPSRAW
10135 */
10136#ifdef IEM_WITHOUT_ASSEMBLY
10137
10138IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10139{
10140 RTUINT64U uSrc1 = { *puDst };
10141 RTUINT64U uSrc2 = { *puSrc };
10142 RTUINT64U uDst;
10143
10144 if (uSrc2.au64[0] <= 15)
10145 {
10146 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
10147 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
10148 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
10149 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
10150 }
10151 else
10152 {
10153 uDst.au64[0] = 0;
10154 }
10155 *puDst = uDst.u;
10156}
10157
10158
10159IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10160{
10161 RTUINT64U uSrc1 = { *puDst };
10162 RTUINT64U uDst;
10163
10164 if (uShift <= 15)
10165 {
10166 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10167 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10168 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10169 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10170 }
10171 else
10172 {
10173 uDst.au64[0] = 0;
10174 }
10175 *puDst = uDst.u;
10176}
10177
10178
10179IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10180{
10181 RTUINT128U uSrc1 = *puDst;
10182
10183 if (puSrc->au64[0] <= 15)
10184 {
10185 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
10186 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
10187 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
10188 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
10189 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
10190 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
10191 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
10192 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
10193 }
10194 else
10195 {
10196 puDst->au64[0] = 0;
10197 puDst->au64[1] = 0;
10198 }
10199}
10200
10201IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10202{
10203 RTUINT128U uSrc1 = *puDst;
10204
10205 if (uShift <= 15)
10206 {
10207 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10208 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10209 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10210 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10211 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10212 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10213 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10214 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10215 }
10216 else
10217 {
10218 puDst->au64[0] = 0;
10219 puDst->au64[1] = 0;
10220 }
10221}
10222
10223#endif
10224
10225
10226/*
10227 * PSLLW / VPSLLW
10228 */
10229#ifdef IEM_WITHOUT_ASSEMBLY
10230
10231IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10232{
10233 RTUINT64U uSrc1 = { *puDst };
10234 RTUINT64U uSrc2 = { *puSrc };
10235 RTUINT64U uDst;
10236
10237 if (uSrc2.au64[0] <= 15)
10238 {
10239 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
10240 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
10241 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
10242 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
10243 }
10244 else
10245 {
10246 uDst.au64[0] = 0;
10247 }
10248 *puDst = uDst.u;
10249}
10250
10251
10252IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10253{
10254 RTUINT64U uSrc1 = { *puDst };
10255 RTUINT64U uDst;
10256
10257 if (uShift <= 15)
10258 {
10259 uDst.au16[0] = uSrc1.au16[0] << uShift;
10260 uDst.au16[1] = uSrc1.au16[1] << uShift;
10261 uDst.au16[2] = uSrc1.au16[2] << uShift;
10262 uDst.au16[3] = uSrc1.au16[3] << uShift;
10263 }
10264 else
10265 {
10266 uDst.au64[0] = 0;
10267 }
10268 *puDst = uDst.u;
10269}
10270
10271
10272IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10273{
10274 RTUINT128U uSrc1 = *puDst;
10275
10276 if (puSrc->au64[0] <= 15)
10277 {
10278 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
10279 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
10280 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
10281 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
10282 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
10283 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
10284 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
10285 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
10286 }
10287 else
10288 {
10289 puDst->au64[0] = 0;
10290 puDst->au64[1] = 0;
10291 }
10292}
10293
10294IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10295{
10296 RTUINT128U uSrc1 = *puDst;
10297
10298 if (uShift <= 15)
10299 {
10300 puDst->au16[0] = uSrc1.au16[0] << uShift;
10301 puDst->au16[1] = uSrc1.au16[1] << uShift;
10302 puDst->au16[2] = uSrc1.au16[2] << uShift;
10303 puDst->au16[3] = uSrc1.au16[3] << uShift;
10304 puDst->au16[4] = uSrc1.au16[4] << uShift;
10305 puDst->au16[5] = uSrc1.au16[5] << uShift;
10306 puDst->au16[6] = uSrc1.au16[6] << uShift;
10307 puDst->au16[7] = uSrc1.au16[7] << uShift;
10308 }
10309 else
10310 {
10311 puDst->au64[0] = 0;
10312 puDst->au64[1] = 0;
10313 }
10314}
10315
10316#endif
10317
10318IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10319{
10320 RTUINT128U uSrc1 = *puSrc1;
10321
10322 if (uShift <= 15)
10323 {
10324 puDst->au16[0] = uSrc1.au16[0] << uShift;
10325 puDst->au16[1] = uSrc1.au16[1] << uShift;
10326 puDst->au16[2] = uSrc1.au16[2] << uShift;
10327 puDst->au16[3] = uSrc1.au16[3] << uShift;
10328 puDst->au16[4] = uSrc1.au16[4] << uShift;
10329 puDst->au16[5] = uSrc1.au16[5] << uShift;
10330 puDst->au16[6] = uSrc1.au16[6] << uShift;
10331 puDst->au16[7] = uSrc1.au16[7] << uShift;
10332 }
10333 else
10334 {
10335 puDst->au64[0] = 0;
10336 puDst->au64[1] = 0;
10337 }
10338}
10339
10340IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10341{
10342 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, puSrc2->au8[0]);
10343}
10344
10345IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10346{
10347 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, uShift);
10348}
10349
10350IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10351{
10352 RTUINT256U uSrc1 = *puSrc1;
10353
10354 if (uShift <= 15)
10355 {
10356 puDst->au16[0] = uSrc1.au16[0] << uShift;
10357 puDst->au16[1] = uSrc1.au16[1] << uShift;
10358 puDst->au16[2] = uSrc1.au16[2] << uShift;
10359 puDst->au16[3] = uSrc1.au16[3] << uShift;
10360 puDst->au16[4] = uSrc1.au16[4] << uShift;
10361 puDst->au16[5] = uSrc1.au16[5] << uShift;
10362 puDst->au16[6] = uSrc1.au16[6] << uShift;
10363 puDst->au16[7] = uSrc1.au16[7] << uShift;
10364 puDst->au16[8] = uSrc1.au16[8] << uShift;
10365 puDst->au16[9] = uSrc1.au16[9] << uShift;
10366 puDst->au16[10] = uSrc1.au16[10] << uShift;
10367 puDst->au16[11] = uSrc1.au16[11] << uShift;
10368 puDst->au16[12] = uSrc1.au16[12] << uShift;
10369 puDst->au16[13] = uSrc1.au16[13] << uShift;
10370 puDst->au16[14] = uSrc1.au16[14] << uShift;
10371 puDst->au16[15] = uSrc1.au16[15] << uShift;
10372 }
10373 else
10374 {
10375 puDst->au64[0] = 0;
10376 puDst->au64[1] = 0;
10377 puDst->au64[2] = 0;
10378 puDst->au64[3] = 0;
10379 }
10380}
10381
10382IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10383{
10384 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, puSrc2->au8[0]);
10385}
10386
10387IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10388{
10389 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, uShift);
10390}
10391
10392/*
10393 * PSRLD / VPSRLD
10394 */
10395#ifdef IEM_WITHOUT_ASSEMBLY
10396
10397IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10398{
10399 RTUINT64U uSrc1 = { *puDst };
10400 RTUINT64U uSrc2 = { *puSrc };
10401 RTUINT64U uDst;
10402
10403 if (uSrc2.au64[0] <= 31)
10404 {
10405 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
10406 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
10407 }
10408 else
10409 {
10410 uDst.au64[0] = 0;
10411 }
10412 *puDst = uDst.u;
10413}
10414
10415
10416IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10417{
10418 RTUINT64U uSrc1 = { *puDst };
10419 RTUINT64U uDst;
10420
10421 if (uShift <= 31)
10422 {
10423 uDst.au32[0] = uSrc1.au32[0] >> uShift;
10424 uDst.au32[1] = uSrc1.au32[1] >> uShift;
10425 }
10426 else
10427 {
10428 uDst.au64[0] = 0;
10429 }
10430 *puDst = uDst.u;
10431}
10432
10433
10434IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10435{
10436 RTUINT128U uSrc1 = *puDst;
10437
10438 if (puSrc->au64[0] <= 31)
10439 {
10440 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
10441 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
10442 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
10443 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
10444 }
10445 else
10446 {
10447 puDst->au64[0] = 0;
10448 puDst->au64[1] = 0;
10449 }
10450}
10451
10452IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10453{
10454 RTUINT128U uSrc1 = *puDst;
10455
10456 if (uShift <= 31)
10457 {
10458 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10459 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10460 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10461 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10462 }
10463 else
10464 {
10465 puDst->au64[0] = 0;
10466 puDst->au64[1] = 0;
10467 }
10468}
10469
10470#endif
10471
10472
10473/*
10474 * PSRAD / VPSRAD
10475 */
10476#ifdef IEM_WITHOUT_ASSEMBLY
10477
10478IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
10479{
10480 RTUINT64U uSrc1 = { *puDst };
10481 RTUINT64U uSrc2 = { *puSrc };
10482 RTUINT64U uDst;
10483
10484 if (uSrc2.au64[0] <= 31)
10485 {
10486 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
10487 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
10488 }
10489 else
10490 {
10491 uDst.au64[0] = 0;
10492 }
10493 *puDst = uDst.u;
10494}
10495
10496
10497IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
10498{
10499 RTUINT64U uSrc1 = { *puDst };
10500 RTUINT64U uDst;
10501
10502 if (uShift <= 31)
10503 {
10504 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10505 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10506 }
10507 else
10508 {
10509 uDst.au64[0] = 0;
10510 }
10511 *puDst = uDst.u;
10512}
10513
10514
10515IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10516{
10517 RTUINT128U uSrc1 = *puDst;
10518
10519 if (puSrc->au64[0] <= 31)
10520 {
10521 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
10522 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
10523 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
10524 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
10525 }
10526 else
10527 {
10528 puDst->au64[0] = 0;
10529 puDst->au64[1] = 0;
10530 }
10531}
10532
10533IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10534{
10535 RTUINT128U uSrc1 = *puDst;
10536
10537 if (uShift <= 31)
10538 {
10539 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10540 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10541 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10542 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10543 }
10544 else
10545 {
10546 puDst->au64[0] = 0;
10547 puDst->au64[1] = 0;
10548 }
10549}
10550
10551#endif
10552
10553
10554/*
10555 * PSLLD / VPSLLD
10556 */
10557#ifdef IEM_WITHOUT_ASSEMBLY
10558
10559IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10560{
10561 RTUINT64U uSrc1 = { *puDst };
10562 RTUINT64U uSrc2 = { *puSrc };
10563 RTUINT64U uDst;
10564
10565 if (uSrc2.au64[0] <= 31)
10566 {
10567 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
10568 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
10569 }
10570 else
10571 {
10572 uDst.au64[0] = 0;
10573 }
10574 *puDst = uDst.u;
10575}
10576
10577
10578IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10579{
10580 RTUINT64U uSrc1 = { *puDst };
10581 RTUINT64U uDst;
10582
10583 if (uShift <= 31)
10584 {
10585 uDst.au32[0] = uSrc1.au32[0] << uShift;
10586 uDst.au32[1] = uSrc1.au32[1] << uShift;
10587 }
10588 else
10589 {
10590 uDst.au64[0] = 0;
10591 }
10592 *puDst = uDst.u;
10593}
10594
10595
10596IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10597{
10598 RTUINT128U uSrc1 = *puDst;
10599
10600 if (puSrc->au64[0] <= 31)
10601 {
10602 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10603 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10604 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10605 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10606 }
10607 else
10608 {
10609 puDst->au64[0] = 0;
10610 puDst->au64[1] = 0;
10611 }
10612}
10613
10614IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10615{
10616 RTUINT128U uSrc1 = *puDst;
10617
10618 if (uShift <= 31)
10619 {
10620 puDst->au32[0] = uSrc1.au32[0] << uShift;
10621 puDst->au32[1] = uSrc1.au32[1] << uShift;
10622 puDst->au32[2] = uSrc1.au32[2] << uShift;
10623 puDst->au32[3] = uSrc1.au32[3] << uShift;
10624 }
10625 else
10626 {
10627 puDst->au64[0] = 0;
10628 puDst->au64[1] = 0;
10629 }
10630}
10631
10632#endif
10633
10634IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10635{
10636 RTUINT128U uSrc1 = *puSrc1;
10637
10638 if (uShift <= 31)
10639 {
10640 puDst->au32[0] = uSrc1.au32[0] << uShift;
10641 puDst->au32[1] = uSrc1.au32[1] << uShift;
10642 puDst->au32[2] = uSrc1.au32[2] << uShift;
10643 puDst->au32[3] = uSrc1.au32[3] << uShift;
10644 }
10645 else
10646 {
10647 puDst->au64[0] = 0;
10648 puDst->au64[1] = 0;
10649 }
10650}
10651
10652IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10653{
10654 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, uShift);
10655}
10656
10657IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10658{
10659 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, puSrc2->au8[0]);
10660}
10661
10662IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10663{
10664 RTUINT256U uSrc1 = *puSrc1;
10665
10666 if (uShift <= 31)
10667 {
10668 puDst->au32[0] = uSrc1.au32[0] << uShift;
10669 puDst->au32[1] = uSrc1.au32[1] << uShift;
10670 puDst->au32[2] = uSrc1.au32[2] << uShift;
10671 puDst->au32[3] = uSrc1.au32[3] << uShift;
10672 puDst->au32[4] = uSrc1.au32[4] << uShift;
10673 puDst->au32[5] = uSrc1.au32[5] << uShift;
10674 puDst->au32[6] = uSrc1.au32[6] << uShift;
10675 puDst->au32[7] = uSrc1.au32[7] << uShift;
10676 }
10677 else
10678 {
10679 puDst->au64[0] = 0;
10680 puDst->au64[1] = 0;
10681 puDst->au64[2] = 0;
10682 puDst->au64[3] = 0;
10683 }
10684}
10685
10686IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10687{
10688 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, puSrc2->au8[0]);
10689}
10690
10691IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10692{
10693 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, uShift);
10694}
10695
10696
10697/*
10698 * PSRLQ / VPSRLQ
10699 */
10700#ifdef IEM_WITHOUT_ASSEMBLY
10701
10702IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10703{
10704 RTUINT64U uSrc1 = { *puDst };
10705 RTUINT64U uSrc2 = { *puSrc };
10706 RTUINT64U uDst;
10707
10708 if (uSrc2.au64[0] <= 63)
10709 {
10710 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10711 }
10712 else
10713 {
10714 uDst.au64[0] = 0;
10715 }
10716 *puDst = uDst.u;
10717}
10718
10719
10720IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10721{
10722 RTUINT64U uSrc1 = { *puDst };
10723 RTUINT64U uDst;
10724
10725 if (uShift <= 63)
10726 {
10727 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10728 }
10729 else
10730 {
10731 uDst.au64[0] = 0;
10732 }
10733 *puDst = uDst.u;
10734}
10735
10736
10737IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10738{
10739 RTUINT128U uSrc1 = *puDst;
10740
10741 if (puSrc->au64[0] <= 63)
10742 {
10743 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10744 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10745 }
10746 else
10747 {
10748 puDst->au64[0] = 0;
10749 puDst->au64[1] = 0;
10750 }
10751}
10752
10753IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10754{
10755 RTUINT128U uSrc1 = *puDst;
10756
10757 if (uShift <= 63)
10758 {
10759 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10760 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10761 }
10762 else
10763 {
10764 puDst->au64[0] = 0;
10765 puDst->au64[1] = 0;
10766 }
10767}
10768
10769#endif
10770
10771
10772/*
10773 * PSLLQ / VPSLLQ
10774 */
10775#ifdef IEM_WITHOUT_ASSEMBLY
10776
10777IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10778{
10779 RTUINT64U uSrc1 = { *puDst };
10780 RTUINT64U uSrc2 = { *puSrc };
10781 RTUINT64U uDst;
10782
10783 if (uSrc2.au64[0] <= 63)
10784 {
10785 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10786 }
10787 else
10788 {
10789 uDst.au64[0] = 0;
10790 }
10791 *puDst = uDst.u;
10792}
10793
10794
10795IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10796{
10797 RTUINT64U uSrc1 = { *puDst };
10798 RTUINT64U uDst;
10799
10800 if (uShift <= 63)
10801 {
10802 uDst.au64[0] = uSrc1.au64[0] << uShift;
10803 }
10804 else
10805 {
10806 uDst.au64[0] = 0;
10807 }
10808 *puDst = uDst.u;
10809}
10810
10811
10812IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10813{
10814 RTUINT128U uSrc1 = *puDst;
10815
10816 if (puSrc->au64[0] <= 63)
10817 {
10818 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10819 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10820 }
10821 else
10822 {
10823 puDst->au64[0] = 0;
10824 puDst->au64[1] = 0;
10825 }
10826}
10827
10828IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10829{
10830 RTUINT128U uSrc1 = *puDst;
10831
10832 if (uShift <= 63)
10833 {
10834 puDst->au64[0] = uSrc1.au64[0] << uShift;
10835 puDst->au64[1] = uSrc1.au64[1] << uShift;
10836 }
10837 else
10838 {
10839 puDst->au64[0] = 0;
10840 puDst->au64[1] = 0;
10841 }
10842}
10843
10844#endif
10845
10846IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10847{
10848 RTUINT128U uSrc1 = *puSrc1;
10849
10850 if (uShift <= 63)
10851 {
10852 puDst->au64[0] = uSrc1.au64[0] << uShift;
10853 puDst->au64[1] = uSrc1.au64[1] << uShift;
10854 }
10855 else
10856 {
10857 puDst->au64[0] = 0;
10858 puDst->au64[1] = 0;
10859 }
10860}
10861
10862IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10863{
10864 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, puSrc2->au8[0]);
10865}
10866
10867IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10868{
10869 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, uShift);
10870}
10871
10872IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10873{
10874 RTUINT256U uSrc1 = *puSrc1;
10875
10876 if (uShift <= 63)
10877 {
10878 puDst->au64[0] = uSrc1.au64[0] << uShift;
10879 puDst->au64[1] = uSrc1.au64[1] << uShift;
10880 puDst->au64[2] = uSrc1.au64[2] << uShift;
10881 puDst->au64[3] = uSrc1.au64[3] << uShift;
10882 }
10883 else
10884 {
10885 puDst->au64[0] = 0;
10886 puDst->au64[1] = 0;
10887 puDst->au64[2] = 0;
10888 puDst->au64[3] = 0;
10889 }
10890}
10891
10892IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10893{
10894 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, puSrc2->au8[0]);
10895}
10896
10897IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10898{
10899 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, uShift);
10900}
10901
10902
10903/*
10904 * PSRLDQ / VPSRLDQ
10905 */
10906#ifdef IEM_WITHOUT_ASSEMBLY
10907
10908IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10909{
10910 RTUINT128U uSrc1 = *puDst;
10911
10912 if (uShift < 16)
10913 {
10914 int i;
10915
10916 for (i = 0; i < 16 - uShift; ++i)
10917 puDst->au8[i] = uSrc1.au8[i + uShift];
10918 for (i = 16 - uShift; i < 16; ++i)
10919 puDst->au8[i] = 0;
10920 }
10921 else
10922 {
10923 puDst->au64[0] = 0;
10924 puDst->au64[1] = 0;
10925 }
10926}
10927
10928#endif
10929
10930
10931/*
10932 * PSLLDQ / VPSLLDQ
10933 */
10934#ifdef IEM_WITHOUT_ASSEMBLY
10935
10936IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10937{
10938 RTUINT128U uSrc1 = *puDst;
10939
10940 if (uShift < 16)
10941 {
10942 int i;
10943
10944 for (i = 0; i < uShift; ++i)
10945 puDst->au8[i] = 0;
10946 for (i = uShift; i < 16; ++i)
10947 puDst->au8[i] = uSrc1.au8[i - uShift];
10948 }
10949 else
10950 {
10951 puDst->au64[0] = 0;
10952 puDst->au64[1] = 0;
10953 }
10954}
10955
10956#endif
10957
10958
10959/*
10960 * PMADDWD / VPMADDWD
10961 */
10962#ifdef IEM_WITHOUT_ASSEMBLY
10963
10964IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10965{
10966 RTUINT64U uSrc1 = { *puDst };
10967 RTUINT64U uSrc2 = { *puSrc };
10968 RTUINT64U uDst;
10969
10970 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
10971 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
10972 *puDst = uDst.u;
10973 RT_NOREF(pFpuState);
10974}
10975
10976
10977IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10978{
10979 RTUINT128U uSrc1 = *puDst;
10980
10981 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
10982 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
10983 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
10984 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
10985 RT_NOREF(pFpuState);
10986}
10987
10988#endif
10989
10990
10991/*
10992 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
10993 */
10994#ifdef IEM_WITHOUT_ASSEMBLY
10995
10996IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10997{
10998 RTUINT64U uSrc1 = { *puDst };
10999 RTUINT64U uSrc2 = { *puSrc };
11000 RTUINT64U uDst;
11001
11002 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
11003 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
11004 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
11005 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
11006 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
11007 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
11008 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
11009 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
11010 *puDst = uDst.u;
11011 RT_NOREF(pFpuState);
11012}
11013
11014
11015IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11016{
11017 RTUINT128U uSrc1 = *puDst;
11018
11019 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
11020 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
11021 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
11022 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
11023 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
11024 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
11025 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
11026 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
11027 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
11028 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
11029 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
11030 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
11031 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
11032 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
11033 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
11034 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
11035 RT_NOREF(pFpuState);
11036}
11037
11038#endif
11039
11040
11041IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11042{
11043 RTUINT128U uSrc1 = *puDst;
11044
11045 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
11046 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
11047 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
11048 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
11049 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
11050 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
11051 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
11052 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
11053 RT_NOREF(pFpuState);
11054}
11055
11056
11057IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11058{
11059 RTUINT128U uSrc1 = *puDst;
11060
11061 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
11062 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
11063 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
11064 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
11065 RT_NOREF(pFpuState);
11066}
11067
11068
11069IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11070 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11071{
11072 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11073 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11074 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11075 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11076 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11077 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11078 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11079 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11080 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11081 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11082 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11083 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11084 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11085 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11086 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11087 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11088 RT_NOREF(pExtState);
11089}
11090
11091
11092IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11093 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11094{
11095 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11096 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11097 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11098 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11099 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11100 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11101 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11102 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11103 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11104 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11105 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11106 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11107 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11108 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11109 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11110 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11111 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
11112 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
11113 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
11114 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
11115 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
11116 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
11117 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
11118 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
11119 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
11120 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
11121 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
11122 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
11123 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
11124 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
11125 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
11126 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
11127 RT_NOREF(pExtState);
11128}
11129
11130
11131IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11132 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11133{
11134 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11135 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11136 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11137 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11138 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11139 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11140 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11141 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11142 RT_NOREF(pExtState);
11143}
11144
11145
11146IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11147 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11148{
11149 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11150 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11151 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11152 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11153 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11154 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11155 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11156 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11157 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11158 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11159 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
11160 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
11161 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
11162 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
11163 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
11164 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
11165 RT_NOREF(pExtState);
11166}
11167
11168
11169IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11170 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11171{
11172 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11173 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11174 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11175 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11176 RT_NOREF(pExtState);
11177}
11178
11179
11180IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11181 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11182{
11183 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11184 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11185 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11186 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11187 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11188 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11189 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11190 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11191 RT_NOREF(pExtState);
11192}
11193
11194
11195/*
11196 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
11197 */
11198#ifdef IEM_WITHOUT_ASSEMBLY
11199
11200IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11201{
11202 RTUINT64U uSrc1 = { *puDst };
11203 RTUINT64U uSrc2 = { *puSrc };
11204 RTUINT64U uDst;
11205
11206 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
11207 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
11208 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
11209 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
11210 *puDst = uDst.u;
11211 RT_NOREF(pFpuState);
11212}
11213
11214
11215IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11216{
11217 RTUINT128U uSrc1 = *puDst;
11218
11219 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11220 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11221 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11222 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11223 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11224 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11225 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11226 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11227 RT_NOREF(pFpuState);
11228}
11229
11230#endif
11231
11232IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11233{
11234 RTUINT128U uSrc1 = *puDst;
11235
11236 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11237 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11238 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11239 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11240 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11241 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11242 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11243 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11244 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11245 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11246 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
11247 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
11248 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
11249 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
11250 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
11251 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
11252 RT_NOREF(pFpuState);
11253}
11254
11255
11256IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11257{
11258 RTUINT128U uSrc1 = *puDst;
11259
11260 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11261 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11262 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11263 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11264 RT_NOREF(pFpuState);
11265}
11266
11267
11268IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11269 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11270{
11271 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11272 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11273 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11274 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11275 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11276 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11277 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11278 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11279 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11280 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11281 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11282 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11283 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11284 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11285 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11286 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11287 RT_NOREF(pExtState);
11288}
11289
11290
11291IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11292 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11293{
11294 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11295 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11296 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11297 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11298 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11299 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11300 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11301 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11302 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11303 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11304 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11305 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11306 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11307 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11308 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11309 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11310 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
11311 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
11312 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
11313 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
11314 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
11315 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
11316 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
11317 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
11318 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
11319 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
11320 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
11321 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
11322 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
11323 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
11324 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
11325 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
11326 RT_NOREF(pExtState);
11327}
11328
11329
11330IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11331 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11332{
11333 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11334 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11335 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11336 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11337 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11338 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11339 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11340 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11341 RT_NOREF(pExtState);
11342}
11343
11344
11345IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11346 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11347{
11348 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11349 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11350 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11351 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11352 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11353 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11354 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11355 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11356 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11357 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11358 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
11359 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
11360 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
11361 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
11362 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
11363 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
11364 RT_NOREF(pExtState);
11365}
11366
11367
11368IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11369 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11370{
11371 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11372 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11373 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11374 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11375 RT_NOREF(pExtState);
11376}
11377
11378
11379IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11380 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11381{
11382 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11383 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11384 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11385 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11386 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11387 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11388 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11389 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11390 RT_NOREF(pExtState);
11391}
11392
11393
11394/*
11395 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
11396 */
11397#ifdef IEM_WITHOUT_ASSEMBLY
11398
11399IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11400{
11401 RTUINT64U uSrc1 = { *puDst };
11402 RTUINT64U uSrc2 = { *puSrc };
11403 RTUINT64U uDst;
11404
11405 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
11406 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
11407 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
11408 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
11409 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
11410 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
11411 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
11412 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
11413 *puDst = uDst.u;
11414 RT_NOREF(pFpuState);
11415}
11416
11417
11418IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11419{
11420 RTUINT128U uSrc1 = *puDst;
11421
11422 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
11423 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
11424 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
11425 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
11426 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
11427 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
11428 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
11429 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
11430 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
11431 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
11432 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
11433 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
11434 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
11435 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
11436 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
11437 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
11438 RT_NOREF(pFpuState);
11439}
11440
11441#endif
11442
11443IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11444{
11445 RTUINT128U uSrc1 = *puDst;
11446
11447 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
11448 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
11449 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
11450 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
11451 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
11452 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
11453 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
11454 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
11455 RT_NOREF(pFpuState);
11456}
11457
11458
11459IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11460{
11461 RTUINT128U uSrc1 = *puDst;
11462
11463 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
11464 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
11465 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
11466 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
11467 RT_NOREF(pFpuState);
11468}
11469
11470
11471IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11472 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11473{
11474 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11475 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11476 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11477 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11478 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11479 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11480 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11481 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11482 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11483 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11484 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11485 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11486 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11487 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11488 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11489 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11490 RT_NOREF(pExtState);
11491}
11492
11493
11494IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11495 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11496{
11497 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11498 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11499 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11500 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11501 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11502 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11503 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11504 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11505 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11506 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11507 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11508 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11509 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11510 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11511 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11512 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11513 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
11514 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
11515 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
11516 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
11517 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
11518 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
11519 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
11520 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
11521 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
11522 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
11523 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
11524 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
11525 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
11526 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
11527 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
11528 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
11529 RT_NOREF(pExtState);
11530}
11531
11532
11533IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11534 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11535{
11536 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11537 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11538 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11539 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11540 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11541 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11542 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11543 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11544 RT_NOREF(pExtState);
11545}
11546
11547
11548IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11549 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11550{
11551 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11552 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11553 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11554 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11555 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11556 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11557 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11558 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11559 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11560 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11561 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
11562 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
11563 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
11564 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
11565 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
11566 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
11567 RT_NOREF(pExtState);
11568}
11569
11570
11571IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11572 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11573{
11574 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11575 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11576 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11577 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11578 RT_NOREF(pExtState);
11579}
11580
11581
11582IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11583 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11584{
11585 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11586 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11587 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11588 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11589 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11590 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11591 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11592 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11593 RT_NOREF(pExtState);
11594}
11595
11596
11597/*
11598 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
11599 */
11600#ifdef IEM_WITHOUT_ASSEMBLY
11601
11602IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11603{
11604 RTUINT64U uSrc1 = { *puDst };
11605 RTUINT64U uSrc2 = { *puSrc };
11606 RTUINT64U uDst;
11607
11608 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
11609 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
11610 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
11611 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
11612 *puDst = uDst.u;
11613 RT_NOREF(pFpuState);
11614}
11615
11616
11617IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11618{
11619 RTUINT128U uSrc1 = *puDst;
11620
11621 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11622 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11623 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11624 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11625 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11626 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11627 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11628 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11629 RT_NOREF(pFpuState);
11630}
11631
11632#endif
11633
11634IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11635{
11636 RTUINT128U uSrc1 = *puDst;
11637
11638 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11639 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11640 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11641 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11642 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11643 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11644 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11645 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11646 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11647 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11648 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
11649 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
11650 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
11651 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
11652 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
11653 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
11654 RT_NOREF(pFpuState);
11655}
11656
11657
11658IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11659{
11660 RTUINT128U uSrc1 = *puDst;
11661
11662 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11663 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11664 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11665 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11666 RT_NOREF(pFpuState);
11667}
11668
11669
11670IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11671 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11672{
11673 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11674 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11675 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11676 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11677 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11678 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11679 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11680 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11681 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11682 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11683 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11684 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11685 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11686 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11687 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11688 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11689 RT_NOREF(pExtState);
11690}
11691
11692
11693IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11694 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11695{
11696 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11697 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11698 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11699 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11700 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11701 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11702 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11703 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11704 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11705 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11706 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11707 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11708 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11709 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11710 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11711 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11712 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
11713 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
11714 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
11715 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
11716 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
11717 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
11718 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
11719 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
11720 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
11721 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
11722 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
11723 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
11724 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
11725 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
11726 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
11727 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
11728 RT_NOREF(pExtState);
11729}
11730
11731
11732IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11733 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11734{
11735 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11736 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11737 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11738 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11739 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11740 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11741 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11742 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11743 RT_NOREF(pExtState);
11744}
11745
11746
11747IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11748 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11749{
11750 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11751 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11752 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11753 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11754 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11755 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11756 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11757 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11758 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11759 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11760 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
11761 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
11762 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
11763 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
11764 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
11765 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
11766 RT_NOREF(pExtState);
11767}
11768
11769
11770IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11771 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11772{
11773 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11774 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11775 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11776 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11777 RT_NOREF(pExtState);
11778}
11779
11780
11781IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11782 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11783{
11784 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11785 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11786 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11787 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11788 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11789 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11790 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11791 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11792 RT_NOREF(pExtState);
11793}
11794
11795
11796/*
11797 * PAVGB / VPAVGB / PAVGW / VPAVGW
11798 */
/** Byte average with rounding: adds the two operands plus one in a wider type, halves the sum and truncates to 8 bits. */
#define PAVGB_EXEC(a_uLeft, a_uRight)   ( (uint8_t)( ((uint16_t)(a_uLeft) + (a_uRight) + 1) >> 1 ) )
/** Word average with rounding: adds the two operands plus one in a wider type, halves the sum and truncates to 16 bits. */
#define PAVGW_EXEC(a_uLeft, a_uRight)   ( (uint16_t)( ((uint32_t)(a_uLeft) + (a_uRight) + 1) >> 1 ) )
11801
11802#ifdef IEM_WITHOUT_ASSEMBLY
11803
11804IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11805{
11806 RTUINT64U uSrc1 = { *puDst };
11807 RTUINT64U uSrc2 = { *puSrc };
11808 RTUINT64U uDst;
11809
11810 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
11811 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
11812 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
11813 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
11814 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
11815 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
11816 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
11817 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
11818 *puDst = uDst.u;
11819}
11820
11821
11822IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11823{
11824 RTUINT128U uSrc1 = *puDst;
11825
11826 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11827 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11828 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11829 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11830 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11831 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11832 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11833 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11834 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11835 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11836 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11837 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11838 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11839 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11840 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11841 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11842}
11843
11844
11845IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11846{
11847 RTUINT64U uSrc1 = { *puDst };
11848 RTUINT64U uSrc2 = { *puSrc };
11849 RTUINT64U uDst;
11850
11851 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
11852 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
11853 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
11854 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
11855 *puDst = uDst.u;
11856}
11857
11858
11859IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11860{
11861 RTUINT128U uSrc1 = *puDst;
11862
11863 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
11864 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
11865 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
11866 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
11867 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
11868 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
11869 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
11870 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
11871}
11872
11873#endif
11874
11875IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11876{
11877 RTUINT128U uSrc1 = *puDst;
11878
11879 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11880 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11881 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11882 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11883 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11884 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11885 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11886 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11887 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11888 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11889 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11890 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11891 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11892 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11893 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11894 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11895}
11896
11897
11898IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11899{
11900 RTUINT128U uSrc1 = *puDst;
11901
11902 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11903 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11904 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11905 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11906 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11907 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11908 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11909 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11910 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11911 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11912 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11913 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11914 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11915 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11916 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11917 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11918}
11919
11920
11921IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11922{
11923 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11924 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11925 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11926 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11927 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11928 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11929 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11930 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11931 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11932 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11933 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11934 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11935 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11936 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11937 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11938 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11939}
11940
11941
11942IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11943{
11944 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11945 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11946 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11947 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11948 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11949 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11950 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11951 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11952 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11953 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11954 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11955 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11956 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11957 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11958 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11959 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11960 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
11961 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
11962 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
11963 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
11964 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
11965 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
11966 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
11967 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
11968 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
11969 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
11970 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
11971 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
11972 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
11973 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
11974 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
11975 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
11976}
11977
11978
11979IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11980{
11981 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11982 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11983 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11984 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11985 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11986 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11987 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11988 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11989}
11990
11991
11992IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11993{
11994 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11995 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11996 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11997 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11998 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11999 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12000 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12001 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12002 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
12003 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
12004 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
12005 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
12006 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
12007 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
12008 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
12009 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
12010}
12011
12012#undef PAVGB_EXEC
12013#undef PAVGW_EXEC
12014
12015
12016/*
12017 * PMOVMSKB / VPMOVMSKB
12018 */
12019#ifdef IEM_WITHOUT_ASSEMBLY
12020
12021IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
12022{
12023 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12024 uint64_t const uSrc = *pu64Src;
12025 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
12026 | ((uSrc >> (15-1)) & RT_BIT_64(1))
12027 | ((uSrc >> (23-2)) & RT_BIT_64(2))
12028 | ((uSrc >> (31-3)) & RT_BIT_64(3))
12029 | ((uSrc >> (39-4)) & RT_BIT_64(4))
12030 | ((uSrc >> (47-5)) & RT_BIT_64(5))
12031 | ((uSrc >> (55-6)) & RT_BIT_64(6))
12032 | ((uSrc >> (63-7)) & RT_BIT_64(7));
12033}
12034
12035
12036IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
12037{
12038 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12039 uint64_t const uSrc0 = pu128Src->QWords.qw0;
12040 uint64_t const uSrc1 = pu128Src->QWords.qw1;
12041 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12042 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12043 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12044 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12045 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12046 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12047 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12048 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12049 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12050 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12051 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12052 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12053 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12054 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12055 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12056 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
12057}
12058
12059#endif
12060
12061IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
12062{
12063 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12064 uint64_t const uSrc0 = puSrc->QWords.qw0;
12065 uint64_t const uSrc1 = puSrc->QWords.qw1;
12066 uint64_t const uSrc2 = puSrc->QWords.qw2;
12067 uint64_t const uSrc3 = puSrc->QWords.qw3;
12068 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12069 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12070 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12071 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12072 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12073 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12074 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12075 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12076 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12077 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12078 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12079 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12080 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12081 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12082 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12083 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
12084 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
12085 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
12086 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
12087 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
12088 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
12089 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
12090 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
12091 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
12092 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
12093 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
12094 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
12095 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
12096 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
12097 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
12098 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
12099 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
12100}
12101
12102
12103/*
12104 * [V]PSHUFB
12105 */
12106
12107IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12108{
12109 RTUINT64U const uSrc = { *puSrc };
12110 RTUINT64U const uDstIn = { *puDst };
12111 ASMCompilerBarrier();
12112 RTUINT64U uDstOut = { 0 };
12113 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
12114 {
12115 uint8_t idxSrc = uSrc.au8[iByte];
12116 if (!(idxSrc & 0x80))
12117 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
12118 }
12119 *puDst = uDstOut.u;
12120 RT_NOREF(pFpuState);
12121}
12122
12123
12124IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12125{
12126 RTUINT128U const uSrc = *puSrc;
12127 RTUINT128U const uDstIn = *puDst;
12128 ASMCompilerBarrier();
12129 puDst->au64[0] = 0;
12130 puDst->au64[1] = 0;
12131 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12132 {
12133 uint8_t idxSrc = uSrc.au8[iByte];
12134 if (!(idxSrc & 0x80))
12135 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
12136 }
12137 RT_NOREF(pFpuState);
12138}
12139
12140
12141IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
12142 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12143{
12144 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
12145 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
12146 ASMCompilerBarrier();
12147 puDst->au64[0] = 0;
12148 puDst->au64[1] = 0;
12149 for (unsigned iByte = 0; iByte < 16; iByte++)
12150 {
12151 uint8_t idxSrc = uSrc2.au8[iByte];
12152 if (!(idxSrc & 0x80))
12153 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12154 }
12155 RT_NOREF(pExtState);
12156}
12157
12158
12159IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
12160 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12161{
12162 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
12163 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
12164 ASMCompilerBarrier();
12165 puDst->au64[0] = 0;
12166 puDst->au64[1] = 0;
12167 puDst->au64[2] = 0;
12168 puDst->au64[3] = 0;
12169 for (unsigned iByte = 0; iByte < 16; iByte++)
12170 {
12171 uint8_t idxSrc = uSrc2.au8[iByte];
12172 if (!(idxSrc & 0x80))
12173 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12174 }
12175 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12176 {
12177 uint8_t idxSrc = uSrc2.au8[iByte];
12178 if (!(idxSrc & 0x80))
12179 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
12180 }
12181 RT_NOREF(pExtState);
12182}
12183
12184
12185/*
12186 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
12187 */
12188#ifdef IEM_WITHOUT_ASSEMBLY
12189
12190IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
12191{
12192 uint64_t const uSrc = *puSrc;
12193 ASMCompilerBarrier();
12194 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12195 uSrc >> (((bEvil >> 2) & 3) * 16),
12196 uSrc >> (((bEvil >> 4) & 3) * 16),
12197 uSrc >> (((bEvil >> 6) & 3) * 16));
12198}
12199
12200
12201IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12202{
12203 puDst->QWords.qw0 = puSrc->QWords.qw0;
12204 uint64_t const uSrc = puSrc->QWords.qw1;
12205 ASMCompilerBarrier();
12206 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12207 uSrc >> (((bEvil >> 2) & 3) * 16),
12208 uSrc >> (((bEvil >> 4) & 3) * 16),
12209 uSrc >> (((bEvil >> 6) & 3) * 16));
12210}
12211
12212#endif
12213
12214IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12215{
12216 puDst->QWords.qw0 = puSrc->QWords.qw0;
12217 uint64_t const uSrc1 = puSrc->QWords.qw1;
12218 puDst->QWords.qw2 = puSrc->QWords.qw2;
12219 uint64_t const uSrc3 = puSrc->QWords.qw3;
12220 ASMCompilerBarrier();
12221 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
12222 uSrc1 >> (((bEvil >> 2) & 3) * 16),
12223 uSrc1 >> (((bEvil >> 4) & 3) * 16),
12224 uSrc1 >> (((bEvil >> 6) & 3) * 16));
12225 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
12226 uSrc3 >> (((bEvil >> 2) & 3) * 16),
12227 uSrc3 >> (((bEvil >> 4) & 3) * 16),
12228 uSrc3 >> (((bEvil >> 6) & 3) * 16));
12229}
12230
12231#ifdef IEM_WITHOUT_ASSEMBLY
12232IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12233{
12234 puDst->QWords.qw1 = puSrc->QWords.qw1;
12235 uint64_t const uSrc = puSrc->QWords.qw0;
12236 ASMCompilerBarrier();
12237 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12238 uSrc >> (((bEvil >> 2) & 3) * 16),
12239 uSrc >> (((bEvil >> 4) & 3) * 16),
12240 uSrc >> (((bEvil >> 6) & 3) * 16));
12241
12242}
12243#endif
12244
12245
12246IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12247{
12248 puDst->QWords.qw3 = puSrc->QWords.qw3;
12249 uint64_t const uSrc2 = puSrc->QWords.qw2;
12250 puDst->QWords.qw1 = puSrc->QWords.qw1;
12251 uint64_t const uSrc0 = puSrc->QWords.qw0;
12252 ASMCompilerBarrier();
12253 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
12254 uSrc0 >> (((bEvil >> 2) & 3) * 16),
12255 uSrc0 >> (((bEvil >> 4) & 3) * 16),
12256 uSrc0 >> (((bEvil >> 6) & 3) * 16));
12257 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
12258 uSrc2 >> (((bEvil >> 2) & 3) * 16),
12259 uSrc2 >> (((bEvil >> 4) & 3) * 16),
12260 uSrc2 >> (((bEvil >> 6) & 3) * 16));
12261
12262}
12263
12264
12265#ifdef IEM_WITHOUT_ASSEMBLY
12266IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12267{
12268 RTUINT128U const uSrc = *puSrc;
12269 ASMCompilerBarrier();
12270 puDst->au32[0] = uSrc.au32[bEvil & 3];
12271 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
12272 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
12273 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
12274}
12275#endif
12276
12277
12278IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12279{
12280 RTUINT256U const uSrc = *puSrc;
12281 ASMCompilerBarrier();
12282 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
12283 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
12284 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
12285 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
12286 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
12287 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
12288 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
12289 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
12290}
12291
12292
12293/*
12294 * PUNPCKHBW - high bytes -> words
12295 */
12296#ifdef IEM_WITHOUT_ASSEMBLY
12297
12298IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12299{
12300 RTUINT64U const uSrc2 = { *puSrc };
12301 RTUINT64U const uSrc1 = { *puDst };
12302 ASMCompilerBarrier();
12303 RTUINT64U uDstOut;
12304 uDstOut.au8[0] = uSrc1.au8[4];
12305 uDstOut.au8[1] = uSrc2.au8[4];
12306 uDstOut.au8[2] = uSrc1.au8[5];
12307 uDstOut.au8[3] = uSrc2.au8[5];
12308 uDstOut.au8[4] = uSrc1.au8[6];
12309 uDstOut.au8[5] = uSrc2.au8[6];
12310 uDstOut.au8[6] = uSrc1.au8[7];
12311 uDstOut.au8[7] = uSrc2.au8[7];
12312 *puDst = uDstOut.u;
12313}
12314
12315
12316IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12317{
12318 RTUINT128U const uSrc2 = *puSrc;
12319 RTUINT128U const uSrc1 = *puDst;
12320 ASMCompilerBarrier();
12321 RTUINT128U uDstOut;
12322 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12323 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12324 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12325 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12326 uDstOut.au8[ 4] = uSrc1.au8[10];
12327 uDstOut.au8[ 5] = uSrc2.au8[10];
12328 uDstOut.au8[ 6] = uSrc1.au8[11];
12329 uDstOut.au8[ 7] = uSrc2.au8[11];
12330 uDstOut.au8[ 8] = uSrc1.au8[12];
12331 uDstOut.au8[ 9] = uSrc2.au8[12];
12332 uDstOut.au8[10] = uSrc1.au8[13];
12333 uDstOut.au8[11] = uSrc2.au8[13];
12334 uDstOut.au8[12] = uSrc1.au8[14];
12335 uDstOut.au8[13] = uSrc2.au8[14];
12336 uDstOut.au8[14] = uSrc1.au8[15];
12337 uDstOut.au8[15] = uSrc2.au8[15];
12338 *puDst = uDstOut;
12339}
12340
12341#endif
12342
12343IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12344{
12345 RTUINT128U const uSrc2 = *puSrc2;
12346 RTUINT128U const uSrc1 = *puSrc1;
12347 ASMCompilerBarrier();
12348 RTUINT128U uDstOut;
12349 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12350 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12351 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12352 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12353 uDstOut.au8[ 4] = uSrc1.au8[10];
12354 uDstOut.au8[ 5] = uSrc2.au8[10];
12355 uDstOut.au8[ 6] = uSrc1.au8[11];
12356 uDstOut.au8[ 7] = uSrc2.au8[11];
12357 uDstOut.au8[ 8] = uSrc1.au8[12];
12358 uDstOut.au8[ 9] = uSrc2.au8[12];
12359 uDstOut.au8[10] = uSrc1.au8[13];
12360 uDstOut.au8[11] = uSrc2.au8[13];
12361 uDstOut.au8[12] = uSrc1.au8[14];
12362 uDstOut.au8[13] = uSrc2.au8[14];
12363 uDstOut.au8[14] = uSrc1.au8[15];
12364 uDstOut.au8[15] = uSrc2.au8[15];
12365 *puDst = uDstOut;
12366}
12367
12368
12369IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12370{
12371 RTUINT256U const uSrc2 = *puSrc2;
12372 RTUINT256U const uSrc1 = *puSrc1;
12373 ASMCompilerBarrier();
12374 RTUINT256U uDstOut;
12375 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12376 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12377 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12378 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12379 uDstOut.au8[ 4] = uSrc1.au8[10];
12380 uDstOut.au8[ 5] = uSrc2.au8[10];
12381 uDstOut.au8[ 6] = uSrc1.au8[11];
12382 uDstOut.au8[ 7] = uSrc2.au8[11];
12383 uDstOut.au8[ 8] = uSrc1.au8[12];
12384 uDstOut.au8[ 9] = uSrc2.au8[12];
12385 uDstOut.au8[10] = uSrc1.au8[13];
12386 uDstOut.au8[11] = uSrc2.au8[13];
12387 uDstOut.au8[12] = uSrc1.au8[14];
12388 uDstOut.au8[13] = uSrc2.au8[14];
12389 uDstOut.au8[14] = uSrc1.au8[15];
12390 uDstOut.au8[15] = uSrc2.au8[15];
12391 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12392 uDstOut.au8[16] = uSrc1.au8[24];
12393 uDstOut.au8[17] = uSrc2.au8[24];
12394 uDstOut.au8[18] = uSrc1.au8[25];
12395 uDstOut.au8[19] = uSrc2.au8[25];
12396 uDstOut.au8[20] = uSrc1.au8[26];
12397 uDstOut.au8[21] = uSrc2.au8[26];
12398 uDstOut.au8[22] = uSrc1.au8[27];
12399 uDstOut.au8[23] = uSrc2.au8[27];
12400 uDstOut.au8[24] = uSrc1.au8[28];
12401 uDstOut.au8[25] = uSrc2.au8[28];
12402 uDstOut.au8[26] = uSrc1.au8[29];
12403 uDstOut.au8[27] = uSrc2.au8[29];
12404 uDstOut.au8[28] = uSrc1.au8[30];
12405 uDstOut.au8[29] = uSrc2.au8[30];
12406 uDstOut.au8[30] = uSrc1.au8[31];
12407 uDstOut.au8[31] = uSrc2.au8[31];
12408 *puDst = uDstOut;
12409}
12410
12411
12412/*
12413 * PUNPCKHWD - high words -> dwords
12414 */
12415#ifdef IEM_WITHOUT_ASSEMBLY
12416
12417IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12418{
12419 RTUINT64U const uSrc2 = { *puSrc };
12420 RTUINT64U const uSrc1 = { *puDst };
12421 ASMCompilerBarrier();
12422 RTUINT64U uDstOut;
12423 uDstOut.au16[0] = uSrc1.au16[2];
12424 uDstOut.au16[1] = uSrc2.au16[2];
12425 uDstOut.au16[2] = uSrc1.au16[3];
12426 uDstOut.au16[3] = uSrc2.au16[3];
12427 *puDst = uDstOut.u;
12428}
12429
12430
12431IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12432{
12433 RTUINT128U const uSrc2 = *puSrc;
12434 RTUINT128U const uSrc1 = *puDst;
12435 ASMCompilerBarrier();
12436 RTUINT128U uDstOut;
12437 uDstOut.au16[0] = uSrc1.au16[4];
12438 uDstOut.au16[1] = uSrc2.au16[4];
12439 uDstOut.au16[2] = uSrc1.au16[5];
12440 uDstOut.au16[3] = uSrc2.au16[5];
12441 uDstOut.au16[4] = uSrc1.au16[6];
12442 uDstOut.au16[5] = uSrc2.au16[6];
12443 uDstOut.au16[6] = uSrc1.au16[7];
12444 uDstOut.au16[7] = uSrc2.au16[7];
12445 *puDst = uDstOut;
12446}
12447
12448#endif
12449
12450IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12451{
12452 RTUINT128U const uSrc2 = *puSrc2;
12453 RTUINT128U const uSrc1 = *puSrc1;
12454 ASMCompilerBarrier();
12455 RTUINT128U uDstOut;
12456 uDstOut.au16[0] = uSrc1.au16[4];
12457 uDstOut.au16[1] = uSrc2.au16[4];
12458 uDstOut.au16[2] = uSrc1.au16[5];
12459 uDstOut.au16[3] = uSrc2.au16[5];
12460 uDstOut.au16[4] = uSrc1.au16[6];
12461 uDstOut.au16[5] = uSrc2.au16[6];
12462 uDstOut.au16[6] = uSrc1.au16[7];
12463 uDstOut.au16[7] = uSrc2.au16[7];
12464 *puDst = uDstOut;
12465}
12466
12467
12468IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12469{
12470 RTUINT256U const uSrc2 = *puSrc2;
12471 RTUINT256U const uSrc1 = *puSrc1;
12472 ASMCompilerBarrier();
12473 RTUINT256U uDstOut;
12474 uDstOut.au16[0] = uSrc1.au16[4];
12475 uDstOut.au16[1] = uSrc2.au16[4];
12476 uDstOut.au16[2] = uSrc1.au16[5];
12477 uDstOut.au16[3] = uSrc2.au16[5];
12478 uDstOut.au16[4] = uSrc1.au16[6];
12479 uDstOut.au16[5] = uSrc2.au16[6];
12480 uDstOut.au16[6] = uSrc1.au16[7];
12481 uDstOut.au16[7] = uSrc2.au16[7];
12482
12483 uDstOut.au16[8] = uSrc1.au16[12];
12484 uDstOut.au16[9] = uSrc2.au16[12];
12485 uDstOut.au16[10] = uSrc1.au16[13];
12486 uDstOut.au16[11] = uSrc2.au16[13];
12487 uDstOut.au16[12] = uSrc1.au16[14];
12488 uDstOut.au16[13] = uSrc2.au16[14];
12489 uDstOut.au16[14] = uSrc1.au16[15];
12490 uDstOut.au16[15] = uSrc2.au16[15];
12491 *puDst = uDstOut;
12492}
12493
12494
12495/*
12496 * PUNPCKHDQ - high dwords -> qword(s)
12497 */
12498#ifdef IEM_WITHOUT_ASSEMBLY
12499
12500IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12501{
12502 RTUINT64U const uSrc2 = { *puSrc };
12503 RTUINT64U const uSrc1 = { *puDst };
12504 ASMCompilerBarrier();
12505 RTUINT64U uDstOut;
12506 uDstOut.au32[0] = uSrc1.au32[1];
12507 uDstOut.au32[1] = uSrc2.au32[1];
12508 *puDst = uDstOut.u;
12509}
12510
12511
12512IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12513{
12514 RTUINT128U const uSrc2 = *puSrc;
12515 RTUINT128U const uSrc1 = *puDst;
12516 ASMCompilerBarrier();
12517 RTUINT128U uDstOut;
12518 uDstOut.au32[0] = uSrc1.au32[2];
12519 uDstOut.au32[1] = uSrc2.au32[2];
12520 uDstOut.au32[2] = uSrc1.au32[3];
12521 uDstOut.au32[3] = uSrc2.au32[3];
12522 *puDst = uDstOut;
12523}
12524
12525#endif
12526
12527IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12528{
12529 RTUINT128U const uSrc2 = *puSrc2;
12530 RTUINT128U const uSrc1 = *puSrc1;
12531 ASMCompilerBarrier();
12532 RTUINT128U uDstOut;
12533 uDstOut.au32[0] = uSrc1.au32[2];
12534 uDstOut.au32[1] = uSrc2.au32[2];
12535 uDstOut.au32[2] = uSrc1.au32[3];
12536 uDstOut.au32[3] = uSrc2.au32[3];
12537 *puDst = uDstOut;
12538}
12539
12540
12541IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12542{
12543 RTUINT256U const uSrc2 = *puSrc2;
12544 RTUINT256U const uSrc1 = *puSrc1;
12545 ASMCompilerBarrier();
12546 RTUINT256U uDstOut;
12547 uDstOut.au32[0] = uSrc1.au32[2];
12548 uDstOut.au32[1] = uSrc2.au32[2];
12549 uDstOut.au32[2] = uSrc1.au32[3];
12550 uDstOut.au32[3] = uSrc2.au32[3];
12551
12552 uDstOut.au32[4] = uSrc1.au32[6];
12553 uDstOut.au32[5] = uSrc2.au32[6];
12554 uDstOut.au32[6] = uSrc1.au32[7];
12555 uDstOut.au32[7] = uSrc2.au32[7];
12556 *puDst = uDstOut;
12557}
12558
12559
12560/*
12561 * PUNPCKHQDQ -> High qwords -> double qword(s).
12562 */
12563#ifdef IEM_WITHOUT_ASSEMBLY
12564IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12565{
12566 RTUINT128U const uSrc2 = *puSrc;
12567 RTUINT128U const uSrc1 = *puDst;
12568 ASMCompilerBarrier();
12569 RTUINT128U uDstOut;
12570 uDstOut.au64[0] = uSrc1.au64[1];
12571 uDstOut.au64[1] = uSrc2.au64[1];
12572 *puDst = uDstOut;
12573}
12574#endif
12575
12576
12577IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12578{
12579 RTUINT128U const uSrc2 = *puSrc2;
12580 RTUINT128U const uSrc1 = *puSrc1;
12581 ASMCompilerBarrier();
12582 RTUINT128U uDstOut;
12583 uDstOut.au64[0] = uSrc1.au64[1];
12584 uDstOut.au64[1] = uSrc2.au64[1];
12585 *puDst = uDstOut;
12586}
12587
12588
12589IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12590{
12591 RTUINT256U const uSrc2 = *puSrc2;
12592 RTUINT256U const uSrc1 = *puSrc1;
12593 ASMCompilerBarrier();
12594 RTUINT256U uDstOut;
12595 uDstOut.au64[0] = uSrc1.au64[1];
12596 uDstOut.au64[1] = uSrc2.au64[1];
12597
12598 uDstOut.au64[2] = uSrc1.au64[3];
12599 uDstOut.au64[3] = uSrc2.au64[3];
12600 *puDst = uDstOut;
12601}
12602
12603
12604/*
12605 * PUNPCKLBW - low bytes -> words
12606 */
12607#ifdef IEM_WITHOUT_ASSEMBLY
12608
12609IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12610{
12611 RTUINT64U const uSrc2 = { *puSrc };
12612 RTUINT64U const uSrc1 = { *puDst };
12613 ASMCompilerBarrier();
12614 RTUINT64U uDstOut;
12615 uDstOut.au8[0] = uSrc1.au8[0];
12616 uDstOut.au8[1] = uSrc2.au8[0];
12617 uDstOut.au8[2] = uSrc1.au8[1];
12618 uDstOut.au8[3] = uSrc2.au8[1];
12619 uDstOut.au8[4] = uSrc1.au8[2];
12620 uDstOut.au8[5] = uSrc2.au8[2];
12621 uDstOut.au8[6] = uSrc1.au8[3];
12622 uDstOut.au8[7] = uSrc2.au8[3];
12623 *puDst = uDstOut.u;
12624}
12625
12626
12627IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12628{
12629 RTUINT128U const uSrc2 = *puSrc;
12630 RTUINT128U const uSrc1 = *puDst;
12631 ASMCompilerBarrier();
12632 RTUINT128U uDstOut;
12633 uDstOut.au8[ 0] = uSrc1.au8[0];
12634 uDstOut.au8[ 1] = uSrc2.au8[0];
12635 uDstOut.au8[ 2] = uSrc1.au8[1];
12636 uDstOut.au8[ 3] = uSrc2.au8[1];
12637 uDstOut.au8[ 4] = uSrc1.au8[2];
12638 uDstOut.au8[ 5] = uSrc2.au8[2];
12639 uDstOut.au8[ 6] = uSrc1.au8[3];
12640 uDstOut.au8[ 7] = uSrc2.au8[3];
12641 uDstOut.au8[ 8] = uSrc1.au8[4];
12642 uDstOut.au8[ 9] = uSrc2.au8[4];
12643 uDstOut.au8[10] = uSrc1.au8[5];
12644 uDstOut.au8[11] = uSrc2.au8[5];
12645 uDstOut.au8[12] = uSrc1.au8[6];
12646 uDstOut.au8[13] = uSrc2.au8[6];
12647 uDstOut.au8[14] = uSrc1.au8[7];
12648 uDstOut.au8[15] = uSrc2.au8[7];
12649 *puDst = uDstOut;
12650}
12651
12652#endif
12653
12654IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12655{
12656 RTUINT128U const uSrc2 = *puSrc2;
12657 RTUINT128U const uSrc1 = *puSrc1;
12658 ASMCompilerBarrier();
12659 RTUINT128U uDstOut;
12660 uDstOut.au8[ 0] = uSrc1.au8[0];
12661 uDstOut.au8[ 1] = uSrc2.au8[0];
12662 uDstOut.au8[ 2] = uSrc1.au8[1];
12663 uDstOut.au8[ 3] = uSrc2.au8[1];
12664 uDstOut.au8[ 4] = uSrc1.au8[2];
12665 uDstOut.au8[ 5] = uSrc2.au8[2];
12666 uDstOut.au8[ 6] = uSrc1.au8[3];
12667 uDstOut.au8[ 7] = uSrc2.au8[3];
12668 uDstOut.au8[ 8] = uSrc1.au8[4];
12669 uDstOut.au8[ 9] = uSrc2.au8[4];
12670 uDstOut.au8[10] = uSrc1.au8[5];
12671 uDstOut.au8[11] = uSrc2.au8[5];
12672 uDstOut.au8[12] = uSrc1.au8[6];
12673 uDstOut.au8[13] = uSrc2.au8[6];
12674 uDstOut.au8[14] = uSrc1.au8[7];
12675 uDstOut.au8[15] = uSrc2.au8[7];
12676 *puDst = uDstOut;
12677}
12678
12679
12680IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12681{
12682 RTUINT256U const uSrc2 = *puSrc2;
12683 RTUINT256U const uSrc1 = *puSrc1;
12684 ASMCompilerBarrier();
12685 RTUINT256U uDstOut;
12686 uDstOut.au8[ 0] = uSrc1.au8[0];
12687 uDstOut.au8[ 1] = uSrc2.au8[0];
12688 uDstOut.au8[ 2] = uSrc1.au8[1];
12689 uDstOut.au8[ 3] = uSrc2.au8[1];
12690 uDstOut.au8[ 4] = uSrc1.au8[2];
12691 uDstOut.au8[ 5] = uSrc2.au8[2];
12692 uDstOut.au8[ 6] = uSrc1.au8[3];
12693 uDstOut.au8[ 7] = uSrc2.au8[3];
12694 uDstOut.au8[ 8] = uSrc1.au8[4];
12695 uDstOut.au8[ 9] = uSrc2.au8[4];
12696 uDstOut.au8[10] = uSrc1.au8[5];
12697 uDstOut.au8[11] = uSrc2.au8[5];
12698 uDstOut.au8[12] = uSrc1.au8[6];
12699 uDstOut.au8[13] = uSrc2.au8[6];
12700 uDstOut.au8[14] = uSrc1.au8[7];
12701 uDstOut.au8[15] = uSrc2.au8[7];
12702 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12703 uDstOut.au8[16] = uSrc1.au8[16];
12704 uDstOut.au8[17] = uSrc2.au8[16];
12705 uDstOut.au8[18] = uSrc1.au8[17];
12706 uDstOut.au8[19] = uSrc2.au8[17];
12707 uDstOut.au8[20] = uSrc1.au8[18];
12708 uDstOut.au8[21] = uSrc2.au8[18];
12709 uDstOut.au8[22] = uSrc1.au8[19];
12710 uDstOut.au8[23] = uSrc2.au8[19];
12711 uDstOut.au8[24] = uSrc1.au8[20];
12712 uDstOut.au8[25] = uSrc2.au8[20];
12713 uDstOut.au8[26] = uSrc1.au8[21];
12714 uDstOut.au8[27] = uSrc2.au8[21];
12715 uDstOut.au8[28] = uSrc1.au8[22];
12716 uDstOut.au8[29] = uSrc2.au8[22];
12717 uDstOut.au8[30] = uSrc1.au8[23];
12718 uDstOut.au8[31] = uSrc2.au8[23];
12719 *puDst = uDstOut;
12720}
12721
12722
12723/*
12724 * PUNPCKLWD - low words -> dwords
12725 */
12726#ifdef IEM_WITHOUT_ASSEMBLY
12727
12728IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12729{
12730 RTUINT64U const uSrc2 = { *puSrc };
12731 RTUINT64U const uSrc1 = { *puDst };
12732 ASMCompilerBarrier();
12733 RTUINT64U uDstOut;
12734 uDstOut.au16[0] = uSrc1.au16[0];
12735 uDstOut.au16[1] = uSrc2.au16[0];
12736 uDstOut.au16[2] = uSrc1.au16[1];
12737 uDstOut.au16[3] = uSrc2.au16[1];
12738 *puDst = uDstOut.u;
12739}
12740
12741
12742IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12743{
12744 RTUINT128U const uSrc2 = *puSrc;
12745 RTUINT128U const uSrc1 = *puDst;
12746 ASMCompilerBarrier();
12747 RTUINT128U uDstOut;
12748 uDstOut.au16[0] = uSrc1.au16[0];
12749 uDstOut.au16[1] = uSrc2.au16[0];
12750 uDstOut.au16[2] = uSrc1.au16[1];
12751 uDstOut.au16[3] = uSrc2.au16[1];
12752 uDstOut.au16[4] = uSrc1.au16[2];
12753 uDstOut.au16[5] = uSrc2.au16[2];
12754 uDstOut.au16[6] = uSrc1.au16[3];
12755 uDstOut.au16[7] = uSrc2.au16[3];
12756 *puDst = uDstOut;
12757}
12758
12759#endif
12760
12761IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12762{
12763 RTUINT128U const uSrc2 = *puSrc2;
12764 RTUINT128U const uSrc1 = *puSrc1;
12765 ASMCompilerBarrier();
12766 RTUINT128U uDstOut;
12767 uDstOut.au16[0] = uSrc1.au16[0];
12768 uDstOut.au16[1] = uSrc2.au16[0];
12769 uDstOut.au16[2] = uSrc1.au16[1];
12770 uDstOut.au16[3] = uSrc2.au16[1];
12771 uDstOut.au16[4] = uSrc1.au16[2];
12772 uDstOut.au16[5] = uSrc2.au16[2];
12773 uDstOut.au16[6] = uSrc1.au16[3];
12774 uDstOut.au16[7] = uSrc2.au16[3];
12775 *puDst = uDstOut;
12776}
12777
12778
12779IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12780{
12781 RTUINT256U const uSrc2 = *puSrc2;
12782 RTUINT256U const uSrc1 = *puSrc1;
12783 ASMCompilerBarrier();
12784 RTUINT256U uDstOut;
12785 uDstOut.au16[0] = uSrc1.au16[0];
12786 uDstOut.au16[1] = uSrc2.au16[0];
12787 uDstOut.au16[2] = uSrc1.au16[1];
12788 uDstOut.au16[3] = uSrc2.au16[1];
12789 uDstOut.au16[4] = uSrc1.au16[2];
12790 uDstOut.au16[5] = uSrc2.au16[2];
12791 uDstOut.au16[6] = uSrc1.au16[3];
12792 uDstOut.au16[7] = uSrc2.au16[3];
12793
12794 uDstOut.au16[8] = uSrc1.au16[8];
12795 uDstOut.au16[9] = uSrc2.au16[8];
12796 uDstOut.au16[10] = uSrc1.au16[9];
12797 uDstOut.au16[11] = uSrc2.au16[9];
12798 uDstOut.au16[12] = uSrc1.au16[10];
12799 uDstOut.au16[13] = uSrc2.au16[10];
12800 uDstOut.au16[14] = uSrc1.au16[11];
12801 uDstOut.au16[15] = uSrc2.au16[11];
12802 *puDst = uDstOut;
12803}
12804
12805
12806/*
12807 * PUNPCKLDQ - low dwords -> qword(s)
12808 */
12809#ifdef IEM_WITHOUT_ASSEMBLY
12810
12811IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12812{
12813 RTUINT64U const uSrc2 = { *puSrc };
12814 RTUINT64U const uSrc1 = { *puDst };
12815 ASMCompilerBarrier();
12816 RTUINT64U uDstOut;
12817 uDstOut.au32[0] = uSrc1.au32[0];
12818 uDstOut.au32[1] = uSrc2.au32[0];
12819 *puDst = uDstOut.u;
12820}
12821
12822
12823IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12824{
12825 RTUINT128U const uSrc2 = *puSrc;
12826 RTUINT128U const uSrc1 = *puDst;
12827 ASMCompilerBarrier();
12828 RTUINT128U uDstOut;
12829 uDstOut.au32[0] = uSrc1.au32[0];
12830 uDstOut.au32[1] = uSrc2.au32[0];
12831 uDstOut.au32[2] = uSrc1.au32[1];
12832 uDstOut.au32[3] = uSrc2.au32[1];
12833 *puDst = uDstOut;
12834}
12835
12836#endif
12837
12838IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12839{
12840 RTUINT128U const uSrc2 = *puSrc2;
12841 RTUINT128U const uSrc1 = *puSrc1;
12842 ASMCompilerBarrier();
12843 RTUINT128U uDstOut;
12844 uDstOut.au32[0] = uSrc1.au32[0];
12845 uDstOut.au32[1] = uSrc2.au32[0];
12846 uDstOut.au32[2] = uSrc1.au32[1];
12847 uDstOut.au32[3] = uSrc2.au32[1];
12848 *puDst = uDstOut;
12849}
12850
12851
12852IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12853{
12854 RTUINT256U const uSrc2 = *puSrc2;
12855 RTUINT256U const uSrc1 = *puSrc1;
12856 ASMCompilerBarrier();
12857 RTUINT256U uDstOut;
12858 uDstOut.au32[0] = uSrc1.au32[0];
12859 uDstOut.au32[1] = uSrc2.au32[0];
12860 uDstOut.au32[2] = uSrc1.au32[1];
12861 uDstOut.au32[3] = uSrc2.au32[1];
12862
12863 uDstOut.au32[4] = uSrc1.au32[4];
12864 uDstOut.au32[5] = uSrc2.au32[4];
12865 uDstOut.au32[6] = uSrc1.au32[5];
12866 uDstOut.au32[7] = uSrc2.au32[5];
12867 *puDst = uDstOut;
12868}
12869
12870
12871/*
12872 * PUNPCKLQDQ -> Low qwords -> double qword(s).
12873 */
12874#ifdef IEM_WITHOUT_ASSEMBLY
12875IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12876{
12877 RTUINT128U const uSrc2 = *puSrc;
12878 RTUINT128U const uSrc1 = *puDst;
12879 ASMCompilerBarrier();
12880 RTUINT128U uDstOut;
12881 uDstOut.au64[0] = uSrc1.au64[0];
12882 uDstOut.au64[1] = uSrc2.au64[0];
12883 *puDst = uDstOut;
12884}
12885#endif
12886
12887
12888IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12889{
12890 RTUINT128U const uSrc2 = *puSrc2;
12891 RTUINT128U const uSrc1 = *puSrc1;
12892 ASMCompilerBarrier();
12893 RTUINT128U uDstOut;
12894 uDstOut.au64[0] = uSrc1.au64[0];
12895 uDstOut.au64[1] = uSrc2.au64[0];
12896 *puDst = uDstOut;
12897}
12898
12899
12900IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12901{
12902 RTUINT256U const uSrc2 = *puSrc2;
12903 RTUINT256U const uSrc1 = *puSrc1;
12904 ASMCompilerBarrier();
12905 RTUINT256U uDstOut;
12906 uDstOut.au64[0] = uSrc1.au64[0];
12907 uDstOut.au64[1] = uSrc2.au64[0];
12908
12909 uDstOut.au64[2] = uSrc1.au64[2];
12910 uDstOut.au64[3] = uSrc2.au64[2];
12911 *puDst = uDstOut;
12912}
12913
12914
12915/*
12916 * PACKSSWB - signed words -> signed bytes
12917 */
12918
12919#ifdef IEM_WITHOUT_ASSEMBLY
12920
12921IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12922{
12923 RTUINT64U const uSrc2 = { *puSrc };
12924 RTUINT64U const uSrc1 = { *puDst };
12925 ASMCompilerBarrier();
12926 RTUINT64U uDstOut;
12927 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12928 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12929 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12930 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12931 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12932 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12933 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12934 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12935 *puDst = uDstOut.u;
12936}
12937
12938
12939IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12940{
12941 RTUINT128U const uSrc2 = *puSrc;
12942 RTUINT128U const uSrc1 = *puDst;
12943 ASMCompilerBarrier();
12944 RTUINT128U uDstOut;
12945 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12946 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12947 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12948 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12949 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12950 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12951 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12952 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12953 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12954 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12955 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12956 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12957 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12958 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12959 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12960 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12961 *puDst = uDstOut;
12962}
12963
12964#endif
12965
12966IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12967{
12968 RTUINT128U const uSrc2 = *puSrc2;
12969 RTUINT128U const uSrc1 = *puSrc1;
12970 ASMCompilerBarrier();
12971 RTUINT128U uDstOut;
12972 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12973 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12974 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12975 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12976 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12977 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12978 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12979 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12980 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12981 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12982 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12983 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12984 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12985 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12986 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12987 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12988 *puDst = uDstOut;
12989}
12990
12991
12992IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12993{
12994 RTUINT256U const uSrc2 = *puSrc2;
12995 RTUINT256U const uSrc1 = *puSrc1;
12996 ASMCompilerBarrier();
12997 RTUINT256U uDstOut;
12998 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12999 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13000 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13001 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13002 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13003 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13004 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13005 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13006 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13007 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13008 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13009 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13010 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13011 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13012 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13013 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13014
13015 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
13016 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
13017 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
13018 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
13019 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
13020 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
13021 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
13022 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
13023 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
13024 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
13025 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
13026 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
13027 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
13028 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
13029 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
13030 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
13031 *puDst = uDstOut;
13032}
13033
13034
13035/*
13036 * PACKUSWB - signed words -> unsigned bytes
13037 */
/* Values 0x0000..0x00ff pass through as-is.  Anything else is clamped by the
   source sign bit (bit 15): set -> 0x00 (UINT8_MIN), clear -> 0xff (UINT8_MAX). */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    ( (uint16_t)(a_iWord) > (uint16_t)0xff \
      ? ((((a_iWord) >> 15) & 1) ? 0x00 : 0xff) \
      : (uint8_t)(a_iWord) )
13042
13043#ifdef IEM_WITHOUT_ASSEMBLY
13044
13045IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13046{
13047 RTUINT64U const uSrc2 = { *puSrc };
13048 RTUINT64U const uSrc1 = { *puDst };
13049 ASMCompilerBarrier();
13050 RTUINT64U uDstOut;
13051 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13052 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13053 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13054 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13055 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13056 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13057 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13058 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13059 *puDst = uDstOut.u;
13060}
13061
13062
13063IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13064{
13065 RTUINT128U const uSrc2 = *puSrc;
13066 RTUINT128U const uSrc1 = *puDst;
13067 ASMCompilerBarrier();
13068 RTUINT128U uDstOut;
13069 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13070 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13071 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13072 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13073 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13074 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13075 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13076 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13077 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13078 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13079 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13080 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13081 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13082 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13083 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13084 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13085 *puDst = uDstOut;
13086}
13087
13088#endif
13089
13090IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13091{
13092 RTUINT128U const uSrc2 = *puSrc2;
13093 RTUINT128U const uSrc1 = *puSrc1;
13094 ASMCompilerBarrier();
13095 RTUINT128U uDstOut;
13096 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13097 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13098 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13099 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13100 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13101 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13102 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13103 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13104 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13105 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13106 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13107 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13108 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13109 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13110 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13111 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13112 *puDst = uDstOut;
13113}
13114
13115
13116IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13117{
13118 RTUINT256U const uSrc2 = *puSrc2;
13119 RTUINT256U const uSrc1 = *puSrc1;
13120 ASMCompilerBarrier();
13121 RTUINT256U uDstOut;
13122 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13123 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13124 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13125 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13126 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13127 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13128 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13129 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13130 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13131 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13132 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13133 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13134 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13135 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13136 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13137 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13138
13139 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
13140 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
13141 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
13142 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
13143 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
13144 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
13145 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
13146 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
13147 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
13148 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
13149 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
13150 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
13151 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
13152 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
13153 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
13154 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
13155 *puDst = uDstOut;
13156}
13157
13158
13159/*
13160 * PACKSSDW - signed dwords -> signed words
13161 */
13162
13163#ifdef IEM_WITHOUT_ASSEMBLY
13164
13165IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13166{
13167 RTUINT64U const uSrc2 = { *puSrc };
13168 RTUINT64U const uSrc1 = { *puDst };
13169 ASMCompilerBarrier();
13170 RTUINT64U uDstOut;
13171 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13172 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13173 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13174 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13175 *puDst = uDstOut.u;
13176}
13177
13178
13179IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13180{
13181 RTUINT128U const uSrc2 = *puSrc;
13182 RTUINT128U const uSrc1 = *puDst;
13183 ASMCompilerBarrier();
13184 RTUINT128U uDstOut;
13185 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13186 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13187 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13188 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13189 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13190 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13191 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13192 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13193 *puDst = uDstOut;
13194}
13195
13196#endif
13197
13198IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13199{
13200 RTUINT128U const uSrc2 = *puSrc2;
13201 RTUINT128U const uSrc1 = *puSrc1;
13202 ASMCompilerBarrier();
13203 RTUINT128U uDstOut;
13204 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13205 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13206 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13207 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13208 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13209 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13210 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13211 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13212 *puDst = uDstOut;
13213}
13214
13215
13216IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13217{
13218 RTUINT256U const uSrc2 = *puSrc2;
13219 RTUINT256U const uSrc1 = *puSrc1;
13220 ASMCompilerBarrier();
13221 RTUINT256U uDstOut;
13222 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13223 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13224 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13225 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13226 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13227 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13228 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13229 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13230
13231 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
13232 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
13233 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
13234 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
13235 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
13236 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
13237 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
13238 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
13239 *puDst = uDstOut;
13240}
13241
13242
13243/*
13244 * PACKUSDW - signed dwords -> unsigned words
13245 */
/** Converts the raw bits of a signed dword to an unsigned word with saturation:
 * values that fit in 16 bits pass through, larger positive values become
 * 0xffff, and values with bit 31 (the sign) set become 0.
 * @note Evaluates @a a_iDword more than once. */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    ( (uint32_t)(a_iDword) > 0xffffU \
      ? ( ((uint32_t)(a_iDword) >> 31) != 0 ? (uint16_t)0 : (uint16_t)0xffff ) \
      : (uint16_t)(a_iDword) )
13250
13251#ifdef IEM_WITHOUT_ASSEMBLY
13252IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13253{
13254 RTUINT128U const uSrc2 = *puSrc;
13255 RTUINT128U const uSrc1 = *puDst;
13256 ASMCompilerBarrier();
13257 RTUINT128U uDstOut;
13258 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13259 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13260 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13261 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13262 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13263 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13264 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13265 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13266 *puDst = uDstOut;
13267}
13268#endif
13269
13270IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13271{
13272 RTUINT128U const uSrc2 = *puSrc2;
13273 RTUINT128U const uSrc1 = *puSrc1;
13274 ASMCompilerBarrier();
13275 RTUINT128U uDstOut;
13276 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13277 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13278 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13279 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13280 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13281 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13282 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13283 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13284 *puDst = uDstOut;
13285}
13286
13287
13288IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13289{
13290 RTUINT256U const uSrc2 = *puSrc2;
13291 RTUINT256U const uSrc1 = *puSrc1;
13292 ASMCompilerBarrier();
13293 RTUINT256U uDstOut;
13294 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13295 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13296 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13297 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13298 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13299 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13300 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13301 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13302
13303 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
13304 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
13305 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
13306 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
13307 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
13308 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
13309 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
13310 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
13311 *puDst = uDstOut;
13312}
13313
13314
13315/*
13316 * [V]PABSB / [V]PABSW / [V]PABSD
13317 */
13318
13319IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13320{
13321 RTUINT64U const uSrc = { *puSrc };
13322 RTUINT64U uDstOut = { 0 };
13323
13324 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
13325 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
13326 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
13327 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
13328 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
13329 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
13330 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
13331 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
13332 *puDst = uDstOut.u;
13333 RT_NOREF(pFpuState);
13334}
13335
13336
13337IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13338{
13339 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13340 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13341 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13342 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13343 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13344 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13345 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13346 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13347 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13348 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13349 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13350 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13351 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13352 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13353 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13354 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13355 RT_NOREF(pFpuState);
13356}
13357
13358
13359IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13360{
13361 RTUINT64U const uSrc = { *puSrc };
13362 RTUINT64U uDstOut = { 0 };
13363
13364 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
13365 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
13366 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
13367 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
13368 *puDst = uDstOut.u;
13369 RT_NOREF(pFpuState);
13370}
13371
13372
13373IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13374{
13375 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13376 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13377 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13378 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13379 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13380 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13381 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13382 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13383 RT_NOREF(pFpuState);
13384}
13385
13386
13387IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13388{
13389 RTUINT64U const uSrc = { *puSrc };
13390 RTUINT64U uDstOut = { 0 };
13391
13392 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
13393 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
13394 *puDst = uDstOut.u;
13395 RT_NOREF(pFpuState);
13396}
13397
13398
13399IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13400{
13401 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13402 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13403 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13404 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13405 RT_NOREF(pFpuState);
13406}
13407
13408
13409IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13410{
13411 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13412 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13413 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13414 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13415 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13416 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13417 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13418 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13419 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13420 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13421 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13422 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13423 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13424 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13425 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13426 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13427}
13428
13429
13430IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13431{
13432 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13433 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13434 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13435 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13436 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13437 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13438 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13439 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13440 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13441 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13442 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13443 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13444 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13445 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13446 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13447 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13448 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
13449 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
13450 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
13451 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
13452 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
13453 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
13454 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
13455 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
13456 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
13457 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
13458 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
13459 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
13460 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
13461 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
13462 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
13463 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
13464}
13465
13466
13467IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13468{
13469 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13470 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13471 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13472 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13473 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13474 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13475 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13476 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13477}
13478
13479
13480IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13481{
13482 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13483 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13484 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13485 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13486 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13487 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13488 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13489 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13490 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
13491 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
13492 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
13493 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
13494 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
13495 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
13496 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
13497 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
13498}
13499
13500
13501IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13502{
13503 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13504 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13505 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13506 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13507}
13508
13509
13510IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13511{
13512 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13513 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13514 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13515 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13516 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
13517 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
13518 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
13519 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
13520}
13521
13522
13523/*
13524 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
13525 */
13526IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13527{
13528 RTUINT64U uSrc1 = { *puDst };
13529 RTUINT64U uSrc2 = { *puSrc };
13530 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13531
13532 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
13533 {
13534 if (uSrc2.ai8[i] < 0)
13535 uDst.ai8[i] = -uSrc1.ai8[i];
13536 else if (uSrc2.ai8[i] == 0)
13537 uDst.ai8[i] = 0;
13538 else /* uSrc2.ai8[i] > 0 */
13539 uDst.ai8[i] = uSrc1.ai8[i];
13540 }
13541
13542 *puDst = uDst.u;
13543 RT_NOREF(pFpuState);
13544}
13545
13546
13547IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13548{
13549 RTUINT128U uSrc1 = *puDst;
13550
13551 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13552 {
13553 if (puSrc->ai8[i] < 0)
13554 puDst->ai8[i] = -uSrc1.ai8[i];
13555 else if (puSrc->ai8[i] == 0)
13556 puDst->ai8[i] = 0;
13557 else /* puSrc->ai8[i] > 0 */
13558 puDst->ai8[i] = uSrc1.ai8[i];
13559 }
13560
13561 RT_NOREF(pFpuState);
13562}
13563
13564
13565IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13566{
13567 RTUINT64U uSrc1 = { *puDst };
13568 RTUINT64U uSrc2 = { *puSrc };
13569 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13570
13571 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
13572 {
13573 if (uSrc2.ai16[i] < 0)
13574 uDst.ai16[i] = -uSrc1.ai16[i];
13575 else if (uSrc2.ai16[i] == 0)
13576 uDst.ai16[i] = 0;
13577 else /* uSrc2.ai16[i] > 0 */
13578 uDst.ai16[i] = uSrc1.ai16[i];
13579 }
13580
13581 *puDst = uDst.u;
13582 RT_NOREF(pFpuState);
13583}
13584
13585
13586IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13587{
13588 RTUINT128U uSrc1 = *puDst;
13589
13590 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13591 {
13592 if (puSrc->ai16[i] < 0)
13593 puDst->ai16[i] = -uSrc1.ai16[i];
13594 else if (puSrc->ai16[i] == 0)
13595 puDst->ai16[i] = 0;
13596 else /* puSrc->ai16[i] > 0 */
13597 puDst->ai16[i] = uSrc1.ai16[i];
13598 }
13599
13600 RT_NOREF(pFpuState);
13601}
13602
13603
13604IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13605{
13606 RTUINT64U uSrc1 = { *puDst };
13607 RTUINT64U uSrc2 = { *puSrc };
13608 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13609
13610 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
13611 {
13612 if (uSrc2.ai32[i] < 0)
13613 uDst.ai32[i] = -uSrc1.ai32[i];
13614 else if (uSrc2.ai32[i] == 0)
13615 uDst.ai32[i] = 0;
13616 else /* uSrc2.ai32[i] > 0 */
13617 uDst.ai32[i] = uSrc1.ai32[i];
13618 }
13619
13620 *puDst = uDst.u;
13621 RT_NOREF(pFpuState);
13622}
13623
13624
13625IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13626{
13627 RTUINT128U uSrc1 = *puDst;
13628
13629 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13630 {
13631 if (puSrc->ai32[i] < 0)
13632 puDst->ai32[i] = -uSrc1.ai32[i];
13633 else if (puSrc->ai32[i] == 0)
13634 puDst->ai32[i] = 0;
13635 else /* puSrc->ai32[i] > 0 */
13636 puDst->ai32[i] = uSrc1.ai32[i];
13637 }
13638
13639 RT_NOREF(pFpuState);
13640}
13641
13642
13643IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13644{
13645 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13646 {
13647 if (puSrc2->ai8[i] < 0)
13648 puDst->ai8[i] = -puSrc1->ai8[i];
13649 else if (puSrc2->ai8[i] == 0)
13650 puDst->ai8[i] = 0;
13651 else /* puSrc2->ai8[i] > 0 */
13652 puDst->ai8[i] = puSrc1->ai8[i];
13653 }
13654}
13655
13656
13657IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13658{
13659 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13660 {
13661 if (puSrc2->ai8[i] < 0)
13662 puDst->ai8[i] = -puSrc1->ai8[i];
13663 else if (puSrc2->ai8[i] == 0)
13664 puDst->ai8[i] = 0;
13665 else /* puSrc2->ai8[i] > 0 */
13666 puDst->ai8[i] = puSrc1->ai8[i];
13667 }
13668}
13669
13670
13671IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13672{
13673 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13674 {
13675 if (puSrc2->ai16[i] < 0)
13676 puDst->ai16[i] = -puSrc1->ai16[i];
13677 else if (puSrc2->ai16[i] == 0)
13678 puDst->ai16[i] = 0;
13679 else /* puSrc2->ai16[i] > 0 */
13680 puDst->ai16[i] = puSrc1->ai16[i];
13681 }
13682}
13683
13684
13685IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13686{
13687 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13688 {
13689 if (puSrc2->ai16[i] < 0)
13690 puDst->ai16[i] = -puSrc1->ai16[i];
13691 else if (puSrc2->ai16[i] == 0)
13692 puDst->ai16[i] = 0;
13693 else /* puSrc2->ai16[i] > 0 */
13694 puDst->ai16[i] = puSrc1->ai16[i];
13695 }
13696}
13697
13698
13699IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13700{
13701 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13702 {
13703 if (puSrc2->ai32[i] < 0)
13704 puDst->ai32[i] = -puSrc1->ai32[i];
13705 else if (puSrc2->ai32[i] == 0)
13706 puDst->ai32[i] = 0;
13707 else /* puSrc2->ai32[i] > 0 */
13708 puDst->ai32[i] = puSrc1->ai32[i];
13709 }
13710}
13711
13712
13713IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13714{
13715 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13716 {
13717 if (puSrc2->ai32[i] < 0)
13718 puDst->ai32[i] = -puSrc1->ai32[i];
13719 else if (puSrc2->ai32[i] == 0)
13720 puDst->ai32[i] = 0;
13721 else /* puSrc2->ai32[i] > 0 */
13722 puDst->ai32[i] = puSrc1->ai32[i];
13723 }
13724}
13725
13726
13727/*
13728 * PHADDW / VPHADDW / PHADDD / VPHADDD
13729 */
13730IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13731{
13732 RTUINT64U uSrc1 = { *puDst };
13733 RTUINT64U uSrc2 = { *puSrc };
13734 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13735
13736 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13737 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13738 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
13739 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
13740 *puDst = uDst.u;
13741 RT_NOREF(pFpuState);
13742}
13743
13744
13745IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13746{
13747 RTUINT128U uSrc1 = *puDst;
13748
13749 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13750 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13751 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
13752 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
13753
13754 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
13755 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
13756 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
13757 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
13758 RT_NOREF(pFpuState);
13759}
13760
13761
13762IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13763{
13764 RTUINT64U uSrc1 = { *puDst };
13765 RTUINT64U uSrc2 = { *puSrc };
13766 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13767
13768 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13769 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
13770 *puDst = uDst.u;
13771 RT_NOREF(pFpuState);
13772}
13773
13774
13775IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13776{
13777 RTUINT128U uSrc1 = *puDst;
13778
13779 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13780 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
13781
13782 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
13783 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
13784 RT_NOREF(pFpuState);
13785}
13786
13787
13788IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13789{
13790 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13791
13792 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
13793 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
13794 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
13795 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
13796
13797 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
13798 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
13799 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
13800 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
13801
13802 puDst->au64[0] = uDst.au64[0];
13803 puDst->au64[1] = uDst.au64[1];
13804}
13805
13806
13807IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13808{
13809 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13810
13811 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
13812 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
13813 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
13814 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
13815 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
13816 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
13817 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
13818 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
13819
13820 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
13821 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
13822 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
13823 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
13824 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
13825 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
13826 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
13827 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
13828
13829 puDst->au64[0] = uDst.au64[0];
13830 puDst->au64[1] = uDst.au64[1];
13831 puDst->au64[2] = uDst.au64[2];
13832 puDst->au64[3] = uDst.au64[3];
13833}
13834
13835
13836IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13837{
13838 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13839
13840 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
13841 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
13842
13843 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
13844 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
13845
13846 puDst->au64[0] = uDst.au64[0];
13847 puDst->au64[1] = uDst.au64[1];
13848}
13849
13850
13851IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13852{
13853 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13854
13855 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
13856 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
13857 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
13858 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
13859
13860 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
13861 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
13862 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
13863 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
13864
13865 puDst->au64[0] = uDst.au64[0];
13866 puDst->au64[1] = uDst.au64[1];
13867 puDst->au64[2] = uDst.au64[2];
13868 puDst->au64[3] = uDst.au64[3];
13869}
13870
13871
13872/*
13873 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
13874 */
13875IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13876{
13877 RTUINT64U uSrc1 = { *puDst };
13878 RTUINT64U uSrc2 = { *puSrc };
13879 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13880
13881 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13882 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13883 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
13884 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
13885 *puDst = uDst.u;
13886 RT_NOREF(pFpuState);
13887}
13888
13889
13890IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13891{
13892 RTUINT128U uSrc1 = *puDst;
13893
13894 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13895 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13896 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
13897 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
13898
13899 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
13900 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
13901 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
13902 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
13903 RT_NOREF(pFpuState);
13904}
13905
13906
13907IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13908{
13909 RTUINT64U uSrc1 = { *puDst };
13910 RTUINT64U uSrc2 = { *puSrc };
13911 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13912
13913 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13914 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
13915 *puDst = uDst.u;
13916 RT_NOREF(pFpuState);
13917}
13918
13919
13920IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13921{
13922 RTUINT128U uSrc1 = *puDst;
13923
13924 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13925 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
13926
13927 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
13928 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
13929 RT_NOREF(pFpuState);
13930}
13931
13932
13933IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13934{
13935 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13936
13937 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
13938 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
13939 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
13940 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
13941
13942 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
13943 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
13944 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
13945 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
13946
13947 puDst->au64[0] = uDst.au64[0];
13948 puDst->au64[1] = uDst.au64[1];
13949}
13950
13951
13952IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13953{
13954 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13955
13956 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
13957 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
13958 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
13959 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
13960 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
13961 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
13962 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
13963 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
13964
13965 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
13966 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
13967 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
13968 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
13969 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
13970 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
13971 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
13972 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
13973
13974 puDst->au64[0] = uDst.au64[0];
13975 puDst->au64[1] = uDst.au64[1];
13976 puDst->au64[2] = uDst.au64[2];
13977 puDst->au64[3] = uDst.au64[3];
13978}
13979
13980
13981IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13982{
13983 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13984
13985 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
13986 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
13987
13988 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
13989 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
13990
13991 puDst->au64[0] = uDst.au64[0];
13992 puDst->au64[1] = uDst.au64[1];
13993}
13994
13995
13996IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13997{
13998 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13999
14000 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
14001 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
14002 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
14003 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
14004
14005 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
14006 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
14007 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
14008 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
14009
14010 puDst->au64[0] = uDst.au64[0];
14011 puDst->au64[1] = uDst.au64[1];
14012 puDst->au64[2] = uDst.au64[2];
14013 puDst->au64[3] = uDst.au64[3];
14014}
14015
14016
14017/*
14018 * PHADDSW / VPHADDSW
14019 */
14020IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14021{
14022 RTUINT64U uSrc1 = { *puDst };
14023 RTUINT64U uSrc2 = { *puSrc };
14024 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14025
14026 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14027 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14028 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
14029 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
14030 *puDst = uDst.u;
14031 RT_NOREF(pFpuState);
14032}
14033
14034
14035IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14036{
14037 RTUINT128U uSrc1 = *puDst;
14038
14039 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14040 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14041 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
14042 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
14043
14044 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
14045 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
14046 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
14047 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
14048 RT_NOREF(pFpuState);
14049}
14050
14051
14052IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14053{
14054 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14055
14056 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
14057 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
14058 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
14059 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
14060
14061 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
14062 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
14063 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
14064 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
14065
14066 puDst->au64[0] = uDst.au64[0];
14067 puDst->au64[1] = uDst.au64[1];
14068}
14069
14070
14071IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14072{
14073 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14074
14075 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
14076 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
14077 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
14078 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
14079 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
14080 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
14081 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
14082 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
14083
14084 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
14085 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
14086 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
14087 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
14088 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
14089 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
14090 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
14091 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
14092
14093 puDst->au64[0] = uDst.au64[0];
14094 puDst->au64[1] = uDst.au64[1];
14095 puDst->au64[2] = uDst.au64[2];
14096 puDst->au64[3] = uDst.au64[3];
14097}
14098
14099
14100/*
14101 * PHSUBSW / VPHSUBSW
14102 */
14103IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14104{
14105 RTUINT64U uSrc1 = { *puDst };
14106 RTUINT64U uSrc2 = { *puSrc };
14107 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14108
14109 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14110 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14111 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
14112 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
14113 *puDst = uDst.u;
14114 RT_NOREF(pFpuState);
14115}
14116
14117
14118IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14119{
14120 RTUINT128U uSrc1 = *puDst;
14121
14122 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14123 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14124 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
14125 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
14126
14127 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
14128 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
14129 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
14130 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
14131 RT_NOREF(pFpuState);
14132}
14133
14134
14135IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14136{
14137 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14138
14139 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
14140 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
14141 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
14142 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
14143
14144 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
14145 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
14146 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
14147 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
14148
14149 puDst->au64[0] = uDst.au64[0];
14150 puDst->au64[1] = uDst.au64[1];
14151}
14152
14153
14154IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14155{
14156 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14157
14158 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
14159 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
14160 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
14161 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
14162 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
14163 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
14164 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
14165 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
14166
14167 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
14168 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
14169 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
14170 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
14171 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
14172 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
14173 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
14174 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
14175
14176 puDst->au64[0] = uDst.au64[0];
14177 puDst->au64[1] = uDst.au64[1];
14178 puDst->au64[2] = uDst.au64[2];
14179 puDst->au64[3] = uDst.au64[3];
14180}
14181
14182
14183/*
14184 * PMADDUBSW / VPMADDUBSW
14185 */
14186IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14187{
14188 RTUINT64U uSrc1 = { *puDst };
14189 RTUINT64U uSrc2 = { *puSrc };
14190 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14191
14192 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
14193 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
14194 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
14195 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
14196 *puDst = uDst.u;
14197 RT_NOREF(pFpuState);
14198}
14199
14200
14201IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14202{
14203 RTUINT128U uSrc1 = *puDst;
14204
14205 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
14206 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
14207 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
14208 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
14209 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
14210 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
14211 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
14212 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
14213 RT_NOREF(pFpuState);
14214}
14215
14216
14217IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14218{
14219 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14220
14221 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14222 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14223 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14224 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14225 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14226 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14227 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14228 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14229
14230 puDst->au64[0] = uDst.au64[0];
14231 puDst->au64[1] = uDst.au64[1];
14232}
14233
14234
14235IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14236{
14237 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14238
14239 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14240 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14241 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14242 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14243 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14244 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14245 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14246 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14247 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
14248 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
14249 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
14250 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
14251 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
14252 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
14253 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
14254 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
14255
14256 puDst->au64[0] = uDst.au64[0];
14257 puDst->au64[1] = uDst.au64[1];
14258 puDst->au64[2] = uDst.au64[2];
14259 puDst->au64[3] = uDst.au64[3];
14260}
14261
14262
14263/*
14264 * PMULHRSW / VPMULHRSW
14265 */
/** PMULHRSW element operation: multiplies the two values as 32-bit signed
 * integers, shifts the product right by 14, adds one, shifts right by one
 * more and truncates the outcome to 16 bits.  Each argument is evaluated
 * exactly once. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ( (uint16_t)( ( ( ((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1 ) >> 1 ) )
14268
14269IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14270{
14271 RTUINT64U uSrc1 = { *puDst };
14272 RTUINT64U uSrc2 = { *puSrc };
14273 RTUINT64U uDst;
14274
14275 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
14276 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
14277 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
14278 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
14279 *puDst = uDst.u;
14280 RT_NOREF(pFpuState);
14281}
14282
14283
14284IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14285{
14286 RTUINT128U uSrc1 = *puDst;
14287
14288 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
14289 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
14290 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
14291 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
14292 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
14293 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
14294 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
14295 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
14296 RT_NOREF(pFpuState);
14297}
14298
14299
14300IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14301{
14302 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14303
14304 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
14305 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
14306 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
14307 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
14308 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
14309 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
14310 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
14311 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
14312
14313 puDst->au64[0] = uDst.au64[0];
14314 puDst->au64[1] = uDst.au64[1];
14315}
14316
14317
14318IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14319{
14320 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14321
14322 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
14323 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
14324 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
14325 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
14326 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
14327 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
14328 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
14329 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
14330 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
14331 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
14332 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
14333 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
14334 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
14335 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
14336 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
14337 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
14338
14339 puDst->au64[0] = uDst.au64[0];
14340 puDst->au64[1] = uDst.au64[1];
14341 puDst->au64[2] = uDst.au64[2];
14342 puDst->au64[3] = uDst.au64[3];
14343}
14344
14345
14346/*
14347 * PSADBW / VPSADBW
14348 */
14349#ifdef IEM_WITHOUT_ASSEMBLY
14350
14351IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
14352{
14353 RTUINT64U uSrc1 = { *puDst };
14354 RTUINT64U uSrc2 = { *puSrc };
14355 RTUINT64U uDst;
14356 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14357 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14358 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14359 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14360 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14361 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14362 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14363 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14364
14365 uDst.au64[0] = 0;
14366 uDst.au16[0] = uSum;
14367 *puDst = uDst.u;
14368}
14369
14370
14371IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14372{
14373 RTUINT128U uSrc1 = *puDst;
14374
14375 puDst->au64[0] = 0;
14376 puDst->au64[1] = 0;
14377
14378 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
14379 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
14380 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
14381 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
14382 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
14383 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
14384 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
14385 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
14386 puDst->au16[0] = uSum;
14387
14388 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
14389 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
14390 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
14391 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
14392 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
14393 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
14394 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
14395 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
14396 puDst->au16[4] = uSum;
14397}
14398
14399#endif
14400
14401IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14402{
14403 RTUINT128U uSrc1 = *puSrc1;
14404 RTUINT128U uSrc2 = *puSrc2;
14405
14406 puDst->au64[0] = 0;
14407 puDst->au64[1] = 0;
14408
14409 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
14410 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14411 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14412 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14413 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14414 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14415 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14416 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14417 puDst->au16[0] = uSum;
14418
14419 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14420 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14421 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14422 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14423 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14424 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14425 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14426 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14427 puDst->au16[4] = uSum;
14428}
14429
14430IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14431{
14432 RTUINT256U uSrc1 = *puSrc1;
14433 RTUINT256U uSrc2 = *puSrc2;
14434
14435 puDst->au64[0] = 0;
14436 puDst->au64[1] = 0;
14437 puDst->au64[2] = 0;
14438 puDst->au64[3] = 0;
14439
14440 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14441 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14442 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14443 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14444 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14445 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14446 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14447 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14448 puDst->au16[0] = uSum;
14449
14450 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14451 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14452 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14453 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14454 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14455 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14456 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14457 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14458 puDst->au16[4] = uSum;
14459
14460 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
14461 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
14462 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
14463 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
14464 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
14465 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
14466 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
14467 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
14468 puDst->au16[8] = uSum;
14469
14470 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
14471 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
14472 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
14473 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
14474 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
14475 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
14476 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
14477 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
14478 puDst->au16[12] = uSum;
14479}
14480
14481
14482/*
14483 * PMULDQ / VPMULDQ
14484 */
14485IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14486{
14487 RTUINT128U uSrc1 = *puDst;
14488
14489 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
14490 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
14491}
14492
14493IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14494{
14495 RTUINT128U uSrc1 = *puSrc1;
14496 RTUINT128U uSrc2 = *puSrc2;
14497
14498 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14499 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14500}
14501
14502IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14503{
14504 RTUINT256U uSrc1 = *puSrc1;
14505 RTUINT256U uSrc2 = *puSrc2;
14506
14507 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14508 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14509 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
14510 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
14511}
14512
14513
14514/*
14515 * PMULUDQ / VPMULUDQ
14516 */
14517#ifdef IEM_WITHOUT_ASSEMBLY
14518
14519IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14520{
14521 RTUINT64U uSrc1 = { *puDst };
14522 RTUINT64U uSrc2 = { *puSrc };
14523 ASMCompilerBarrier();
14524 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14525 RT_NOREF(pFpuState);
14526}
14527
14528
14529IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14530{
14531 RTUINT128U uSrc1 = *puDst;
14532 RTUINT128U uSrc2 = *puSrc;
14533 ASMCompilerBarrier();
14534 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14535 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14536 RT_NOREF(pFpuState);
14537}
14538
14539#endif
14540
14541IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14542{
14543 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14544 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14545 ASMCompilerBarrier();
14546 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14547 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14548}
14549
14550
14551IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14552{
14553 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14554 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14555 ASMCompilerBarrier();
14556 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14557 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14558 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
14559 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
14560}
14561
14562
14563/*
14564 * UNPCKLPS / VUNPCKLPS
14565 */
14566#ifdef IEM_WITHOUT_ASSEMBLY
14567IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14568{
14569 RTUINT128U uSrc1 = *puDst;
14570 RTUINT128U uSrc2 = *puSrc;
14571 ASMCompilerBarrier();
14572 puDst->au32[0] = uSrc1.au32[0];
14573 puDst->au32[1] = uSrc2.au32[0];
14574 puDst->au32[2] = uSrc1.au32[1];
14575 puDst->au32[3] = uSrc2.au32[1];
14576}
14577
14578#endif
14579
14580IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14581{
14582 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14583 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14584 ASMCompilerBarrier();
14585 puDst->au32[0] = uSrc1.au32[0];
14586 puDst->au32[1] = uSrc2.au32[0];
14587 puDst->au32[2] = uSrc1.au32[1];
14588 puDst->au32[3] = uSrc2.au32[1];
14589}
14590
14591
14592IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14593{
14594 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14595 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14596 ASMCompilerBarrier();
14597 puDst->au32[0] = uSrc1.au32[0];
14598 puDst->au32[1] = uSrc2.au32[0];
14599 puDst->au32[2] = uSrc1.au32[1];
14600 puDst->au32[3] = uSrc2.au32[1];
14601
14602 puDst->au32[4] = uSrc1.au32[4];
14603 puDst->au32[5] = uSrc2.au32[4];
14604 puDst->au32[6] = uSrc1.au32[5];
14605 puDst->au32[7] = uSrc2.au32[5];
14606}
14607
14608
14609/*
14610 * UNPCKLPD / VUNPCKLPD
14611 */
14612#ifdef IEM_WITHOUT_ASSEMBLY
14613IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14614{
14615 RTUINT128U uSrc1 = *puDst;
14616 RTUINT128U uSrc2 = *puSrc;
14617 ASMCompilerBarrier();
14618 puDst->au64[0] = uSrc1.au64[0];
14619 puDst->au64[1] = uSrc2.au64[0];
14620}
14621
14622#endif
14623
14624IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14625{
14626 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14627 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14628 ASMCompilerBarrier();
14629 puDst->au64[0] = uSrc1.au64[0];
14630 puDst->au64[1] = uSrc2.au64[0];
14631}
14632
14633
14634IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14635{
14636 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14637 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14638 ASMCompilerBarrier();
14639 puDst->au64[0] = uSrc1.au64[0];
14640 puDst->au64[1] = uSrc2.au64[0];
14641 puDst->au64[2] = uSrc1.au64[2];
14642 puDst->au64[3] = uSrc2.au64[2];
14643}
14644
14645
14646/*
14647 * UNPCKHPS / VUNPCKHPS
14648 */
14649#ifdef IEM_WITHOUT_ASSEMBLY
14650IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14651{
14652 RTUINT128U uSrc1 = *puDst;
14653 RTUINT128U uSrc2 = *puSrc;
14654 ASMCompilerBarrier();
14655 puDst->au32[0] = uSrc1.au32[2];
14656 puDst->au32[1] = uSrc2.au32[2];
14657 puDst->au32[2] = uSrc1.au32[3];
14658 puDst->au32[3] = uSrc2.au32[3];
14659}
14660
14661#endif
14662
14663IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14664{
14665 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14666 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14667 ASMCompilerBarrier();
14668 puDst->au32[0] = uSrc1.au32[2];
14669 puDst->au32[1] = uSrc2.au32[2];
14670 puDst->au32[2] = uSrc1.au32[3];
14671 puDst->au32[3] = uSrc2.au32[3];
14672}
14673
14674
14675IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14676{
14677 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14678 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14679 ASMCompilerBarrier();
14680 puDst->au32[0] = uSrc1.au32[2];
14681 puDst->au32[1] = uSrc2.au32[2];
14682 puDst->au32[2] = uSrc1.au32[3];
14683 puDst->au32[3] = uSrc2.au32[3];
14684
14685 puDst->au32[4] = uSrc1.au32[6];
14686 puDst->au32[5] = uSrc2.au32[6];
14687 puDst->au32[6] = uSrc1.au32[7];
14688 puDst->au32[7] = uSrc2.au32[7];
14689}
14690
14691
14692/*
14693 * UNPCKHPD / VUNPCKHPD
14694 */
14695#ifdef IEM_WITHOUT_ASSEMBLY
14696IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14697{
14698 RTUINT128U uSrc1 = *puDst;
14699 RTUINT128U uSrc2 = *puSrc;
14700 ASMCompilerBarrier();
14701 puDst->au64[0] = uSrc1.au64[1];
14702 puDst->au64[1] = uSrc2.au64[1];
14703}
14704
14705#endif
14706
14707IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14708{
14709 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14710 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14711 ASMCompilerBarrier();
14712 puDst->au64[0] = uSrc1.au64[1];
14713 puDst->au64[1] = uSrc2.au64[1];
14714}
14715
14716
14717IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14718{
14719 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14720 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14721 ASMCompilerBarrier();
14722 puDst->au64[0] = uSrc1.au64[1];
14723 puDst->au64[1] = uSrc2.au64[1];
14724 puDst->au64[2] = uSrc1.au64[3];
14725 puDst->au64[3] = uSrc2.au64[3];
14726}
14727
14728
14729/*
 * CRC32 (SSE 4.2).
14731 */
14732
14733IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
14734{
14735 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14736}
14737
14738
14739IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
14740{
14741 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14742}
14743
14744IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
14745{
14746 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14747}
14748
14749IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
14750{
14751 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14752}
14753
14754
14755/*
 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
14757 */
14758#ifdef IEM_WITHOUT_ASSEMBLY
14759IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
14760{
14761 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14762 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14763 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14764 fEfl |= X86_EFL_ZF;
14765 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14766 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14767 fEfl |= X86_EFL_CF;
14768 *pfEFlags = fEfl;
14769}
14770#endif
14771
14772IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
14773{
14774 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14775 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14776 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
14777 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
14778 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14779 fEfl |= X86_EFL_ZF;
14780 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14781 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
14782 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
14783 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14784 fEfl |= X86_EFL_CF;
14785 *pfEFlags = fEfl;
14786}
14787
14788
14789/*
14790 * PMOVSXBW / VPMOVSXBW
14791 */
14792IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14793{
14794 RTUINT64U uSrc1 = { uSrc };
14795 puDst->ai16[0] = uSrc1.ai8[0];
14796 puDst->ai16[1] = uSrc1.ai8[1];
14797 puDst->ai16[2] = uSrc1.ai8[2];
14798 puDst->ai16[3] = uSrc1.ai8[3];
14799 puDst->ai16[4] = uSrc1.ai8[4];
14800 puDst->ai16[5] = uSrc1.ai8[5];
14801 puDst->ai16[6] = uSrc1.ai8[6];
14802 puDst->ai16[7] = uSrc1.ai8[7];
14803}
14804
14805
14806IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14807{
14808 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14809 puDst->ai16[ 0] = uSrc1.ai8[ 0];
14810 puDst->ai16[ 1] = uSrc1.ai8[ 1];
14811 puDst->ai16[ 2] = uSrc1.ai8[ 2];
14812 puDst->ai16[ 3] = uSrc1.ai8[ 3];
14813 puDst->ai16[ 4] = uSrc1.ai8[ 4];
14814 puDst->ai16[ 5] = uSrc1.ai8[ 5];
14815 puDst->ai16[ 6] = uSrc1.ai8[ 6];
14816 puDst->ai16[ 7] = uSrc1.ai8[ 7];
14817 puDst->ai16[ 8] = uSrc1.ai8[ 8];
14818 puDst->ai16[ 9] = uSrc1.ai8[ 9];
14819 puDst->ai16[10] = uSrc1.ai8[10];
14820 puDst->ai16[11] = uSrc1.ai8[11];
14821 puDst->ai16[12] = uSrc1.ai8[12];
14822 puDst->ai16[13] = uSrc1.ai8[13];
14823 puDst->ai16[14] = uSrc1.ai8[14];
14824 puDst->ai16[15] = uSrc1.ai8[15];
14825}
14826
14827
14828/*
14829 * PMOVSXBD / VPMOVSXBD
14830 */
14831IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14832{
14833 RTUINT32U uSrc1 = { uSrc };
14834 puDst->ai32[0] = uSrc1.ai8[0];
14835 puDst->ai32[1] = uSrc1.ai8[1];
14836 puDst->ai32[2] = uSrc1.ai8[2];
14837 puDst->ai32[3] = uSrc1.ai8[3];
14838}
14839
14840
14841IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14842{
14843 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14844 puDst->ai32[0] = uSrc1.ai8[0];
14845 puDst->ai32[1] = uSrc1.ai8[1];
14846 puDst->ai32[2] = uSrc1.ai8[2];
14847 puDst->ai32[3] = uSrc1.ai8[3];
14848 puDst->ai32[4] = uSrc1.ai8[4];
14849 puDst->ai32[5] = uSrc1.ai8[5];
14850 puDst->ai32[6] = uSrc1.ai8[6];
14851 puDst->ai32[7] = uSrc1.ai8[7];
14852}
14853
14854
14855/*
14856 * PMOVSXBQ / VPMOVSXBQ
14857 */
14858IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14859{
14860 RTUINT16U uSrc1 = { uSrc };
14861 puDst->ai64[0] = uSrc1.ai8[0];
14862 puDst->ai64[1] = uSrc1.ai8[1];
14863}
14864
14865
14866IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14867{
14868 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14869 puDst->ai64[0] = uSrc1.ai8[0];
14870 puDst->ai64[1] = uSrc1.ai8[1];
14871 puDst->ai64[2] = uSrc1.ai8[2];
14872 puDst->ai64[3] = uSrc1.ai8[3];
14873}
14874
14875
14876/*
14877 * PMOVSXWD / VPMOVSXWD
14878 */
14879IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14880{
14881 RTUINT64U uSrc1 = { uSrc };
14882 puDst->ai32[0] = uSrc1.ai16[0];
14883 puDst->ai32[1] = uSrc1.ai16[1];
14884 puDst->ai32[2] = uSrc1.ai16[2];
14885 puDst->ai32[3] = uSrc1.ai16[3];
14886}
14887
14888
14889IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14890{
14891 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14892 puDst->ai32[0] = uSrc1.ai16[0];
14893 puDst->ai32[1] = uSrc1.ai16[1];
14894 puDst->ai32[2] = uSrc1.ai16[2];
14895 puDst->ai32[3] = uSrc1.ai16[3];
14896 puDst->ai32[4] = uSrc1.ai16[4];
14897 puDst->ai32[5] = uSrc1.ai16[5];
14898 puDst->ai32[6] = uSrc1.ai16[6];
14899 puDst->ai32[7] = uSrc1.ai16[7];
14900}
14901
14902
14903/*
14904 * PMOVSXWQ / VPMOVSXWQ
14905 */
14906IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14907{
14908 RTUINT32U uSrc1 = { uSrc };
14909 puDst->ai64[0] = uSrc1.ai16[0];
14910 puDst->ai64[1] = uSrc1.ai16[1];
14911}
14912
14913
14914IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14915{
14916 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14917 puDst->ai64[0] = uSrc1.ai16[0];
14918 puDst->ai64[1] = uSrc1.ai16[1];
14919 puDst->ai64[2] = uSrc1.ai16[2];
14920 puDst->ai64[3] = uSrc1.ai16[3];
14921}
14922
14923
14924/*
14925 * PMOVSXDQ / VPMOVSXDQ
14926 */
14927IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14928{
14929 RTUINT64U uSrc1 = { uSrc };
14930 puDst->ai64[0] = uSrc1.ai32[0];
14931 puDst->ai64[1] = uSrc1.ai32[1];
14932}
14933
14934
14935IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14936{
14937 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14938 puDst->ai64[0] = uSrc1.ai32[0];
14939 puDst->ai64[1] = uSrc1.ai32[1];
14940 puDst->ai64[2] = uSrc1.ai32[2];
14941 puDst->ai64[3] = uSrc1.ai32[3];
14942}
14943
14944
14945/*
14946 * PMOVZXBW / VPMOVZXBW
14947 */
14948IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14949{
14950 RTUINT64U uSrc1 = { uSrc };
14951 puDst->au16[0] = uSrc1.au8[0];
14952 puDst->au16[1] = uSrc1.au8[1];
14953 puDst->au16[2] = uSrc1.au8[2];
14954 puDst->au16[3] = uSrc1.au8[3];
14955 puDst->au16[4] = uSrc1.au8[4];
14956 puDst->au16[5] = uSrc1.au8[5];
14957 puDst->au16[6] = uSrc1.au8[6];
14958 puDst->au16[7] = uSrc1.au8[7];
14959}
14960
14961
14962IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14963{
14964 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14965 puDst->au16[ 0] = uSrc1.au8[ 0];
14966 puDst->au16[ 1] = uSrc1.au8[ 1];
14967 puDst->au16[ 2] = uSrc1.au8[ 2];
14968 puDst->au16[ 3] = uSrc1.au8[ 3];
14969 puDst->au16[ 4] = uSrc1.au8[ 4];
14970 puDst->au16[ 5] = uSrc1.au8[ 5];
14971 puDst->au16[ 6] = uSrc1.au8[ 6];
14972 puDst->au16[ 7] = uSrc1.au8[ 7];
14973 puDst->au16[ 8] = uSrc1.au8[ 8];
14974 puDst->au16[ 9] = uSrc1.au8[ 9];
14975 puDst->au16[10] = uSrc1.au8[10];
14976 puDst->au16[11] = uSrc1.au8[11];
14977 puDst->au16[12] = uSrc1.au8[12];
14978 puDst->au16[13] = uSrc1.au8[13];
14979 puDst->au16[14] = uSrc1.au8[14];
14980 puDst->au16[15] = uSrc1.au8[15];
14981}
14982
14983
14984/*
14985 * PMOVZXBD / VPMOVZXBD
14986 */
14987IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14988{
14989 RTUINT32U uSrc1 = { uSrc };
14990 puDst->au32[0] = uSrc1.au8[0];
14991 puDst->au32[1] = uSrc1.au8[1];
14992 puDst->au32[2] = uSrc1.au8[2];
14993 puDst->au32[3] = uSrc1.au8[3];
14994}
14995
14996
14997IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14998{
14999 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15000 puDst->au32[0] = uSrc1.au8[0];
15001 puDst->au32[1] = uSrc1.au8[1];
15002 puDst->au32[2] = uSrc1.au8[2];
15003 puDst->au32[3] = uSrc1.au8[3];
15004 puDst->au32[4] = uSrc1.au8[4];
15005 puDst->au32[5] = uSrc1.au8[5];
15006 puDst->au32[6] = uSrc1.au8[6];
15007 puDst->au32[7] = uSrc1.au8[7];
15008}
15009
15010
15011/*
15012 * PMOVZXBQ / VPMOVZXBQ
15013 */
15014IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15015{
15016 RTUINT16U uSrc1 = { uSrc };
15017 puDst->au64[0] = uSrc1.au8[0];
15018 puDst->au64[1] = uSrc1.au8[1];
15019}
15020
15021
15022IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15023{
15024 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15025 puDst->au64[0] = uSrc1.au8[0];
15026 puDst->au64[1] = uSrc1.au8[1];
15027 puDst->au64[2] = uSrc1.au8[2];
15028 puDst->au64[3] = uSrc1.au8[3];
15029}
15030
15031
15032/*
15033 * PMOVZXWD / VPMOVZXWD
15034 */
15035IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15036{
15037 RTUINT64U uSrc1 = { uSrc };
15038 puDst->au32[0] = uSrc1.au16[0];
15039 puDst->au32[1] = uSrc1.au16[1];
15040 puDst->au32[2] = uSrc1.au16[2];
15041 puDst->au32[3] = uSrc1.au16[3];
15042}
15043
15044
15045IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15046{
15047 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15048 puDst->au32[0] = uSrc1.au16[0];
15049 puDst->au32[1] = uSrc1.au16[1];
15050 puDst->au32[2] = uSrc1.au16[2];
15051 puDst->au32[3] = uSrc1.au16[3];
15052 puDst->au32[4] = uSrc1.au16[4];
15053 puDst->au32[5] = uSrc1.au16[5];
15054 puDst->au32[6] = uSrc1.au16[6];
15055 puDst->au32[7] = uSrc1.au16[7];
15056}
15057
15058
15059/*
15060 * PMOVZXWQ / VPMOVZXWQ
15061 */
15062IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15063{
15064 RTUINT32U uSrc1 = { uSrc };
15065 puDst->au64[0] = uSrc1.au16[0];
15066 puDst->au64[1] = uSrc1.au16[1];
15067}
15068
15069
15070IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15071{
15072 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15073 puDst->au64[0] = uSrc1.au16[0];
15074 puDst->au64[1] = uSrc1.au16[1];
15075 puDst->au64[2] = uSrc1.au16[2];
15076 puDst->au64[3] = uSrc1.au16[3];
15077}
15078
15079
15080/*
15081 * PMOVZXDQ / VPMOVZXDQ
15082 */
15083IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15084{
15085 RTUINT64U uSrc1 = { uSrc };
15086 puDst->au64[0] = uSrc1.au32[0];
15087 puDst->au64[1] = uSrc1.au32[1];
15088}
15089
15090
15091IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15092{
15093 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15094 puDst->au64[0] = uSrc1.au32[0];
15095 puDst->au64[1] = uSrc1.au32[1];
15096 puDst->au64[2] = uSrc1.au32[2];
15097 puDst->au64[3] = uSrc1.au32[3];
15098}
15099
15100/**
15101 * Converts from the packed IPRT 32-bit (single precision) floating point format to
15102 * the SoftFloat 32-bit floating point format (float32_t).
15103 *
15104 * This is only a structure format conversion, nothing else.
15105 */
15106DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
15107{
15108 float32_t Tmp;
15109 Tmp.v = pr32Val->u;
15110 return Tmp;
15111}
15112
15113
15114/**
15115 * Converts from SoftFloat 32-bit floating point format (float32_t)
15116 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
15117 *
15118 * This is only a structure format conversion, nothing else.
15119 */
15120DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
15121{
15122 pr32Dst->u = r32XSrc.v;
15123 return pr32Dst;
15124}
15125
15126
15127/**
15128 * Converts from the packed IPRT 64-bit (single precision) floating point format to
15129 * the SoftFloat 64-bit floating point format (float64_t).
15130 *
15131 * This is only a structure format conversion, nothing else.
15132 */
15133DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
15134{
15135 float64_t Tmp;
15136 Tmp.v = pr64Val->u;
15137 return Tmp;
15138}
15139
15140
15141/**
15142 * Converts from SoftFloat 64-bit floating point format (float64_t)
15143 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
15144 *
15145 * This is only a structure format conversion, nothing else.
15146 */
15147DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
15148{
15149 pr64Dst->u = r64XSrc.v;
15150 return pr64Dst;
15151}
15152
15153
/**
 * Initializer for the SoftFloat state structure, derived from an MXCSR value.
 *
 * The second entry maps the MXCSR rounding control field to the matching
 * SoftFloat rounding mode (anything other than nearest/up/down selects
 * softfloat_round_minMag); the fourth entry is the MXCSR exception mask bits
 * shifted down.
 *
 * @param   a_Mxcsr     The MXCSR value.  Evaluated multiple times.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        softfloat_tininess_afterRounding, \
          ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? (uint8_t)softfloat_round_min \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        :                                                           (uint8_t)softfloat_round_minMag, \
        0, /* NOTE(review): presumably the initial exception flags - confirm against softfloat_state_t. */ \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), /* Matches X86_FSW_?E */ \
        32 /* Rounding precision, not relevant for SIMD. */ \
    }
15166
15167#ifdef IEM_WITHOUT_ASSEMBLY
15168
15169/**
15170 * Helper for transfering exception to MXCSR and setting the result value
15171 * accordingly.
15172 *
15173 * @returns Updated MXCSR.
15174 * @param pSoftState The SoftFloat state following the operation.
15175 * @param r32Result The result of the SoftFloat operation.
15176 * @param pr32Result Where to store the result for IEM.
15177 * @param fMxcsr The original MXCSR value.
15178 */
15179DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
15180 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15181{
15182 iemFpSoftF32ToIprt(pr32Result, r32Result);
15183
15184 uint8_t fXcpt = pSoftState->exceptionFlags;
15185 if ( (fMxcsr & X86_MXCSR_FZ)
15186 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
15187 {
15188 /* Underflow masked and flush to zero is set. */
15189 pr32Result->s.uFraction = 0;
15190 pr32Result->s.uExponent = 0;
15191 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15192 }
15193
15194 /* If DAZ is set \#DE is never set. */
15195 if ( fMxcsr & X86_MXCSR_DAZ
15196 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15197 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15198 fXcpt &= ~X86_MXCSR_DE;
15199
15200 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15201}
15202
15203
15204/**
15205 * Helper for transfering exception to MXCSR and setting the result value
15206 * accordingly - ignores Flush-to-Zero.
15207 *
15208 * @returns Updated MXCSR.
15209 * @param pSoftState The SoftFloat state following the operation.
15210 * @param r32Result The result of the SoftFloat operation.
15211 * @param pr32Result Where to store the result for IEM.
15212 * @param fMxcsr The original MXCSR value.
15213 */
15214DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
15215 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15216{
15217 iemFpSoftF32ToIprt(pr32Result, r32Result);
15218
15219 uint8_t fXcpt = pSoftState->exceptionFlags;
15220 /* If DAZ is set \#DE is never set. */
15221 if ( fMxcsr & X86_MXCSR_DAZ
15222 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15223 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15224 fXcpt &= ~X86_MXCSR_DE;
15225
15226 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15227}
15228
15229
15230/**
15231 * Helper for transfering exception to MXCSR and setting the result value
15232 * accordingly.
15233 *
15234 * @returns Updated MXCSR.
15235 * @param pSoftState The SoftFloat state following the operation.
15236 * @param r64Result The result of the SoftFloat operation.
15237 * @param pr64Result Where to store the result for IEM.
15238 * @param fMxcsr The original MXCSR value.
15239 */
15240DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
15241 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15242{
15243 iemFpSoftF64ToIprt(pr64Result, r64Result);
15244 uint8_t fXcpt = pSoftState->exceptionFlags;
15245 if ( (fMxcsr & X86_MXCSR_FZ)
15246 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
15247 {
15248 /* Underflow masked and flush to zero is set. */
15249 iemFpSoftF64ToIprt(pr64Result, r64Result);
15250 pr64Result->s.uFractionHigh = 0;
15251 pr64Result->s.uFractionLow = 0;
15252 pr64Result->s.uExponent = 0;
15253 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15254 }
15255
15256 /* If DAZ is set \#DE is never set. */
15257 if ( fMxcsr & X86_MXCSR_DAZ
15258 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15259 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15260 fXcpt &= ~X86_MXCSR_DE;
15261
15262 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15263}
15264
15265
15266/**
15267 * Helper for transfering exception to MXCSR and setting the result value
15268 * accordingly - ignores Flush-to-Zero.
15269 *
15270 * @returns Updated MXCSR.
15271 * @param pSoftState The SoftFloat state following the operation.
15272 * @param r64Result The result of the SoftFloat operation.
15273 * @param pr64Result Where to store the result for IEM.
15274 * @param fMxcsr The original MXCSR value.
15275 */
15276DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
15277 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15278{
15279 iemFpSoftF64ToIprt(pr64Result, r64Result);
15280
15281 uint8_t fXcpt = pSoftState->exceptionFlags;
15282 /* If DAZ is set \#DE is never set. */
15283 if ( fMxcsr & X86_MXCSR_DAZ
15284 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15285 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15286 fXcpt &= ~X86_MXCSR_DE;
15287
15288 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15289}
15290
15291#endif /* IEM_WITHOUT_ASSEMBLY */
15292
15293
15294/**
15295 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
15296 * in MXCSR into account.
15297 *
15298 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15299 * @param pr32Val Where to store the result.
15300 * @param fMxcsr The input MXCSR value.
15301 * @param pr32Src The value to use.
15302 */
15303DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15304{
15305 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
15306 {
15307 if (fMxcsr & X86_MXCSR_DAZ)
15308 {
15309 /* De-normals are changed to 0. */
15310 pr32Val->s.fSign = pr32Src->s.fSign;
15311 pr32Val->s.uFraction = 0;
15312 pr32Val->s.uExponent = 0;
15313 return 0;
15314 }
15315
15316 *pr32Val = *pr32Src;
15317 return X86_MXCSR_DE;
15318 }
15319
15320 *pr32Val = *pr32Src;
15321 return 0;
15322}
15323
15324
15325/**
15326 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
15327 * in MXCSR into account.
15328 *
15329 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15330 * @param pr64Val Where to store the result.
15331 * @param fMxcsr The input MXCSR value.
15332 * @param pr64Src The value to use.
15333 */
15334DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15335{
15336 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
15337 {
15338 if (fMxcsr & X86_MXCSR_DAZ)
15339 {
15340 /* De-normals are changed to 0. */
15341 pr64Val->s64.fSign = pr64Src->s.fSign;
15342 pr64Val->s64.uFraction = 0;
15343 pr64Val->s64.uExponent = 0;
15344 return 0;
15345 }
15346
15347 *pr64Val = *pr64Src;
15348 return X86_MXCSR_DE;
15349 }
15350
15351 *pr64Val = *pr64Src;
15352 return 0;
15353}
15354
15355#ifdef IEM_WITHOUT_ASSEMBLY
15356
15357/**
15358 * Validates the given input operands returning whether the operation can continue or whether one
15359 * of the source operands contains a NaN value, setting the output accordingly.
15360 *
15361 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15362 * @param pr32Res Where to store the result in case the operation can't continue.
15363 * @param pr32Val1 The first input operand.
15364 * @param pr32Val2 The second input operand.
15365 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15366 */
15367DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
15368{
15369 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
15370 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
15371 if (cSNan + cQNan == 2)
15372 {
15373 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15374 *pr32Res = *pr32Val1;
15375 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15376 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15377 return true;
15378 }
15379 if (cSNan)
15380 {
15381 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15382 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15383 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15384 *pfMxcsr |= X86_MXCSR_IE;
15385 return true;
15386 }
15387 if (cQNan)
15388 {
15389 /* The QNan operand is placed into the result. */
15390 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15391 return true;
15392 }
15393
15394 Assert(!cQNan && !cSNan);
15395 return false;
15396}
15397
15398
15399/**
15400 * Validates the given double precision input operands returning whether the operation can continue or whether one
15401 * of the source operands contains a NaN value, setting the output accordingly.
15402 *
15403 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15404 * @param pr64Res Where to store the result in case the operation can't continue.
15405 * @param pr64Val1 The first input operand.
15406 * @param pr64Val2 The second input operand.
15407 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15408 */
15409DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
15410{
15411 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
15412 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
15413 if (cSNan + cQNan == 2)
15414 {
15415 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15416 *pr64Res = *pr64Val1;
15417 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15418 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15419 return true;
15420 }
15421 if (cSNan)
15422 {
15423 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15424 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15425 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15426 *pfMxcsr |= X86_MXCSR_IE;
15427 return true;
15428 }
15429 if (cQNan)
15430 {
15431 /* The QNan operand is placed into the result. */
15432 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15433 return true;
15434 }
15435
15436 Assert(!cQNan && !cSNan);
15437 return false;
15438}
15439
15440
15441/**
15442 * Validates the given single input operand returning whether the operation can continue or whether
15443 * contains a NaN value, setting the output accordingly.
15444 *
15445 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15446 * @param pr32Res Where to store the result in case the operation can't continue.
15447 * @param pr32Val The input operand.
15448 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15449 */
15450DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
15451{
15452 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
15453 {
15454 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15455 *pr32Res = *pr32Val;
15456 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15457 *pfMxcsr |= X86_MXCSR_IE;
15458 return true;
15459 }
15460 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
15461 {
15462 /* The QNan operand is placed into the result. */
15463 *pr32Res = *pr32Val;
15464 return true;
15465 }
15466
15467 return false;
15468}
15469
15470
15471/**
15472 * Validates the given double input operand returning whether the operation can continue or whether
15473 * contains a NaN value, setting the output accordingly.
15474 *
15475 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15476 * @param pr64Res Where to store the result in case the operation can't continue.
15477 * @param pr64Val The input operand.
15478 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15479 */
15480DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
15481{
15482 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
15483 {
15484 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15485 *pr64Res = *pr64Val;
15486 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15487 *pfMxcsr |= X86_MXCSR_IE;
15488 return true;
15489 }
15490 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
15491 {
15492 /* The QNan operand is placed into the result. */
15493 *pr64Res = *pr64Val;
15494 return true;
15495 }
15496
15497 return false;
15498}
15499
15500#endif /* IEM_WITHOUT_ASSEMBLY */
15501
15502/**
15503 * ADDPS
15504 */
15505#ifdef IEM_WITHOUT_ASSEMBLY
15506static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15507{
15508 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15509 return fMxcsr;
15510
15511 RTFLOAT32U r32Src1, r32Src2;
15512 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15513 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15514 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15515 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15516 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15517}
15518
15519
15520IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15521{
15522 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15523 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15524 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15525 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15526}
15527#endif
15528
15529
15530/**
15531 * ADDSS
15532 */
15533#ifdef IEM_WITHOUT_ASSEMBLY
15534IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15535{
15536 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15537 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15538 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15539 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15540}
15541#endif
15542
15543
15544/**
15545 * ADDPD
15546 */
15547#ifdef IEM_WITHOUT_ASSEMBLY
15548static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15549{
15550 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15551 return fMxcsr;
15552
15553 RTFLOAT64U r64Src1, r64Src2;
15554 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15555 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15556 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15557 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15558 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15559}
15560
15561
15562IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15563{
15564 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15565 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15566}
15567#endif
15568
15569
15570/**
15571 * ADDSD
15572 */
15573#ifdef IEM_WITHOUT_ASSEMBLY
15574IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15575{
15576 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15577 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15578}
15579#endif
15580
15581
15582/**
15583 * MULPS
15584 */
15585#ifdef IEM_WITHOUT_ASSEMBLY
15586static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15587{
15588 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15589 return fMxcsr;
15590
15591 RTFLOAT32U r32Src1, r32Src2;
15592 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15593 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15594 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15595 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15596 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15597}
15598
15599
15600IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15601{
15602 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15603 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15604 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15605 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15606}
15607#endif
15608
15609
15610/**
15611 * MULSS
15612 */
15613#ifdef IEM_WITHOUT_ASSEMBLY
15614IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15615{
15616 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15617 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15618 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15619 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15620}
15621#endif
15622
15623
15624/**
15625 * MULPD
15626 */
15627#ifdef IEM_WITHOUT_ASSEMBLY
15628static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15629{
15630 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15631 return fMxcsr;
15632
15633 RTFLOAT64U r64Src1, r64Src2;
15634 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15635 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15636 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15637 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15638 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15639}
15640
15641
15642IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15643{
15644 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15645 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15646}
15647#endif
15648
15649
15650/**
15651 * MULSD
15652 */
15653#ifdef IEM_WITHOUT_ASSEMBLY
15654IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15655{
15656 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15657 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15658}
15659#endif
15660
15661
15662/**
15663 * SUBPS
15664 */
15665#ifdef IEM_WITHOUT_ASSEMBLY
15666static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15667{
15668 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15669 return fMxcsr;
15670
15671 RTFLOAT32U r32Src1, r32Src2;
15672 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15673 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15674 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15675 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15676 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15677}
15678
15679
15680IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15681{
15682 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15683 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15684 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15685 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15686}
15687#endif
15688
15689
15690/**
15691 * SUBSS
15692 */
15693#ifdef IEM_WITHOUT_ASSEMBLY
15694IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15695{
15696 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15697 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15698 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15699 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15700}
15701#endif
15702
15703
15704/**
15705 * SUBPD
15706 */
15707#ifdef IEM_WITHOUT_ASSEMBLY
15708static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15709{
15710 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15711 return fMxcsr;
15712
15713 RTFLOAT64U r64Src1, r64Src2;
15714 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15715 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15716 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15717 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15718 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15719}
15720
15721
15722IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15723{
15724 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15725 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15726}
15727#endif
15728
15729
15730/**
15731 * SUBSD
15732 */
15733#ifdef IEM_WITHOUT_ASSEMBLY
15734IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15735{
15736 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15737 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15738}
15739#endif
15740
15741
15742/**
15743 * MINPS
15744 */
15745#ifdef IEM_WITHOUT_ASSEMBLY
15746static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15747{
15748 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15749 {
15750 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15751 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15752 return fMxcsr | X86_MXCSR_IE;
15753 }
15754
15755 RTFLOAT32U r32Src1, r32Src2;
15756 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15757 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15758 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15759 {
15760 *pr32Res = r32Src2;
15761 return fMxcsr;
15762 }
15763
15764 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15765 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15766 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15767 fLe
15768 ? iemFpSoftF32FromIprt(&r32Src1)
15769 : iemFpSoftF32FromIprt(&r32Src2),
15770 pr32Res, fMxcsr);
15771}
15772
15773
15774IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15775{
15776 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15777 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15778 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15779 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15780}
15781#endif
15782
15783
15784/**
15785 * MINSS
15786 */
15787#ifdef IEM_WITHOUT_ASSEMBLY
15788IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15789{
15790 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15791 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15792 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15793 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15794}
15795#endif
15796
15797
15798/**
15799 * MINPD
15800 */
15801#ifdef IEM_WITHOUT_ASSEMBLY
15802static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15803{
15804 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15805 {
15806 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15807 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15808 return fMxcsr | X86_MXCSR_IE;
15809 }
15810
15811 RTFLOAT64U r64Src1, r64Src2;
15812 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15813 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15814 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15815 {
15816 *pr64Res = r64Src2;
15817 return fMxcsr;
15818 }
15819
15820 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15821 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15822 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15823 fLe
15824 ? iemFpSoftF64FromIprt(&r64Src1)
15825 : iemFpSoftF64FromIprt(&r64Src2),
15826 pr64Res, fMxcsr);
15827}
15828
15829
15830IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15831{
15832 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15833 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15834}
15835#endif
15836
15837
15838/**
15839 * MINSD
15840 */
15841#ifdef IEM_WITHOUT_ASSEMBLY
15842IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15843{
15844 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15845 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15846}
15847#endif
15848
15849
15850/**
15851 * DIVPS
15852 */
15853#ifdef IEM_WITHOUT_ASSEMBLY
15854static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15855{
15856 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15857 return fMxcsr;
15858
15859 RTFLOAT32U r32Src1, r32Src2;
15860 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15861 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15862 if (RTFLOAT32U_IS_ZERO(&r32Src2))
15863 {
15864 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
15865 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
15866 {
15867 *pr32Res = g_ar32QNaN[1];
15868 return fMxcsr | X86_MXCSR_IE;
15869 }
15870 else if (RTFLOAT32U_IS_INF(&r32Src1))
15871 {
15872 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15873 return fMxcsr;
15874 }
15875 else
15876 {
15877 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15878 return fMxcsr | X86_MXCSR_ZE;
15879 }
15880 }
15881
15882 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15883 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15884 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15885}
15886
15887
15888IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15889{
15890 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15891 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15892 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15893 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15894}
15895#endif
15896
15897
15898/**
15899 * DIVSS
15900 */
15901#ifdef IEM_WITHOUT_ASSEMBLY
15902IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15903{
15904 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15905 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15906 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15907 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15908}
15909#endif
15910
15911
15912/**
15913 * DIVPD
15914 */
15915#ifdef IEM_WITHOUT_ASSEMBLY
15916static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15917{
15918 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15919 return fMxcsr;
15920
15921 RTFLOAT64U r64Src1, r64Src2;
15922 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15923 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15924 if (RTFLOAT64U_IS_ZERO(&r64Src2))
15925 {
15926 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
15927 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
15928 {
15929 *pr64Res = g_ar64QNaN[1];
15930 return fMxcsr | X86_MXCSR_IE;
15931 }
15932 else if (RTFLOAT64U_IS_INF(&r64Src1))
15933 {
15934 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15935 return fMxcsr;
15936 }
15937 else
15938 {
15939 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15940 return fMxcsr | X86_MXCSR_ZE;
15941 }
15942 }
15943
15944 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15945 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15946 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15947}
15948
15949
15950IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15951{
15952 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15953 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15954}
15955#endif
15956
15957
15958/**
15959 * DIVSD
15960 */
15961#ifdef IEM_WITHOUT_ASSEMBLY
15962IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15963{
15964 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15965 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15966}
15967#endif
15968
15969
15970/**
15971 * MAXPS
15972 */
15973#ifdef IEM_WITHOUT_ASSEMBLY
15974static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15975{
15976 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15977 {
15978 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15979 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15980 return fMxcsr | X86_MXCSR_IE;
15981 }
15982
15983 RTFLOAT32U r32Src1, r32Src2;
15984 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15985 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15986 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15987 {
15988 *pr32Res = r32Src2;
15989 return fMxcsr;
15990 }
15991
15992 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15993 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15994 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15995 fLe
15996 ? iemFpSoftF32FromIprt(&r32Src2)
15997 : iemFpSoftF32FromIprt(&r32Src1),
15998 pr32Res, fMxcsr);
15999}
16000
16001
16002IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16003{
16004 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16005 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16006 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16007 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16008}
16009#endif
16010
16011
16012/**
16013 * MAXSS
16014 */
16015#ifdef IEM_WITHOUT_ASSEMBLY
16016IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16017{
16018 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
16019 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16020 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16021 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16022}
16023#endif
16024
16025
16026/**
16027 * MAXPD
16028 */
16029#ifdef IEM_WITHOUT_ASSEMBLY
16030static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16031{
16032 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16033 {
16034 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16035 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16036 return fMxcsr | X86_MXCSR_IE;
16037 }
16038
16039 RTFLOAT64U r64Src1, r64Src2;
16040 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16041 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16042 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16043 {
16044 *pr64Res = r64Src2;
16045 return fMxcsr;
16046 }
16047
16048 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16049 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16050 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16051 fLe
16052 ? iemFpSoftF64FromIprt(&r64Src2)
16053 : iemFpSoftF64FromIprt(&r64Src1),
16054 pr64Res, fMxcsr);
16055}
16056
16057
16058IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16059{
16060 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16061 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16062}
16063#endif
16064
16065
16066/**
16067 * MAXSD
16068 */
16069#ifdef IEM_WITHOUT_ASSEMBLY
16070IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16071{
16072 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
16073 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16074}
16075#endif
16076
16077
16078/**
16079 * CVTSS2SD
16080 */
16081#ifdef IEM_WITHOUT_ASSEMBLY
16082static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16083{
16084 RTFLOAT32U r32Src1;
16085 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16086
16087 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16088 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16089 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16090}
16091
16092
16093IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16094{
16095 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
16096 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16097}
16098#endif
16099
16100
16101/**
16102 * CVTSD2SS
16103 */
16104#ifdef IEM_WITHOUT_ASSEMBLY
16105static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16106{
16107 RTFLOAT64U r64Src1;
16108 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16109
16110 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16111 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16112 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16113}
16114
16115
16116IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16117{
16118 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
16119 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16120 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16121 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16122}
16123#endif
16124
16125
16126/**
16127 * HADDPS
16128 */
16129#ifdef IEM_WITHOUT_ASSEMBLY
16130IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16131{
16132 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
16133 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
16134 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
16135 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16136}
16137#endif
16138
16139
16140/**
16141 * HADDPD
16142 */
16143#ifdef IEM_WITHOUT_ASSEMBLY
16144IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16145{
16146 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
16147 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16148}
16149#endif
16150
16151
16152/**
16153 * HSUBPS
16154 */
16155#ifdef IEM_WITHOUT_ASSEMBLY
16156IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16157{
16158 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
16159 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
16160 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
16161 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16162}
16163#endif
16164
16165
16166/**
16167 * HSUBPD
16168 */
16169#ifdef IEM_WITHOUT_ASSEMBLY
16170IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16171{
16172 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
16173 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16174}
16175#endif
16176
16177
16178/**
16179 * SQRTPS
16180 */
16181#ifdef IEM_WITHOUT_ASSEMBLY
16182static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16183{
16184 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16185 return fMxcsr;
16186
16187 RTFLOAT32U r32Src;
16188 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
16189 if (RTFLOAT32U_IS_ZERO(&r32Src))
16190 {
16191 *pr32Res = r32Src;
16192 return fMxcsr;
16193 }
16194 else if (r32Src.s.fSign)
16195 {
16196 *pr32Res = g_ar32QNaN[1];
16197 return fMxcsr | X86_MXCSR_IE;
16198 }
16199
16200 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16201 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16202 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16203}
16204
16205
16206IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16207{
16208 RT_NOREF(puSrc1);
16209
16210 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16211 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16212 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16213 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16214}
16215#endif
16216
16217
16218/**
16219 * SQRTSS
16220 */
16221#ifdef IEM_WITHOUT_ASSEMBLY
16222IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16223{
16224 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16225 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16226 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16227 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16228}
16229#endif
16230
16231
16232/**
16233 * SQRTPD
16234 */
16235#ifdef IEM_WITHOUT_ASSEMBLY
16236static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
16237{
16238 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
16239 return fMxcsr;
16240
16241 RTFLOAT64U r64Src;
16242 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
16243 if (RTFLOAT64U_IS_ZERO(&r64Src))
16244 {
16245 *pr64Res = r64Src;
16246 return fMxcsr;
16247 }
16248 else if (r64Src.s.fSign)
16249 {
16250 *pr64Res = g_ar64QNaN[1];
16251 return fMxcsr | X86_MXCSR_IE;
16252 }
16253
16254 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16255 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
16256 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16257}
16258
16259
16260IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16261{
16262 RT_NOREF(puSrc1);
16263
16264 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16265 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16266}
16267#endif
16268
16269
16270/**
16271 * SQRTSD
16272 */
16273#ifdef IEM_WITHOUT_ASSEMBLY
16274IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16275{
16276 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
16277 pResult->uResult.ar64[1] = puSrc1->ar64[1];
16278}
16279#endif
16280
16281
16282#ifdef IEM_WITHOUT_ASSEMBLY
16283/**
16284 * RSQRTPS
16285 */
16286static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16287{
16288 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16289 return fMxcsr;
16290
16291 RTFLOAT32U r32Src;
16292 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16293 if (RTFLOAT32U_IS_ZERO(&r32Src))
16294 {
16295 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16296 return fMxcsr;
16297 }
16298 else if (r32Src.s.fSign)
16299 {
16300 *pr32Res = g_ar32QNaN[1];
16301 return fMxcsr | X86_MXCSR_IE;
16302 }
16303
16304 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16305 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16306 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16307}
16308
16309
16310IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16311{
16312 RT_NOREF(puSrc1);
16313
16314 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16315 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16316 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16317 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16318}
16319
16320
16321/**
16322 * RSQRTSS
16323 */
16324IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16325{
16326 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16327 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16328 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16329 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16330}
16331#endif
16332
16333
16334/**
16335 * RCPPS
16336 */
16337#ifdef IEM_WITHOUT_ASSEMBLY
16338static uint32_t iemAImpl_rcp_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16339{
16340 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16341 return fMxcsr;
16342
16343 RTFLOAT32U r32Src;
16344 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16345 if (RTFLOAT32U_IS_ZERO(&r32Src))
16346 {
16347 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16348 return fMxcsr;
16349 }
16350
16351 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16352 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&g_ar32One[0]), iemFpSoftF32FromIprt(&r32Src), &SoftState);
16353 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16354}
16355
16356
16357IEM_DECL_IMPL_DEF(void, iemAImpl_rcpps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16358{
16359 RT_NOREF(puSrc1);
16360
16361 pResult->MXCSR = iemAImpl_rcp_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16362 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16363 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16364 pResult->MXCSR |= iemAImpl_rcp_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16365}
16366
16367
16368/**
16369 * RCPSS
16370 */
16371IEM_DECL_IMPL_DEF(void, iemAImpl_rcpss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16372{
16373 pResult->MXCSR = iemAImpl_rcp_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
16374 pResult->uResult.ar32[1] = puSrc1->ar32[1];
16375 pResult->uResult.ar32[2] = puSrc1->ar32[2];
16376 pResult->uResult.ar32[3] = puSrc1->ar32[3];
16377}
16378#endif
16379
16380
16381/**
16382 * ADDSUBPS
16383 */
16384#ifdef IEM_WITHOUT_ASSEMBLY
16385IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16386{
16387 RT_NOREF(puSrc1);
16388
16389 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16390 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16391 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16392 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16393}
16394#endif
16395
16396
16397/**
16398 * ADDSUBPD
16399 */
16400#ifdef IEM_WITHOUT_ASSEMBLY
16401IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16402{
16403 RT_NOREF(puSrc1);
16404
16405 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16406 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16407}
16408#endif
16409
16410
16411/**
16412 * CVTPD2PS
16413 */
16414#ifdef IEM_WITHOUT_ASSEMBLY
16415static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16416{
16417 RTFLOAT64U r64Src1;
16418 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16419
16420 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16421 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16422 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16423}
16424
16425
16426IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16427{
16428 RT_NOREF(puSrc1);
16429
16430 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16431 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16432 pResult->uResult.au32[2] = 0;
16433 pResult->uResult.au32[3] = 0;
16434}
16435#endif
16436
16437
16438/**
16439 * CVTPS2PD
16440 */
16441#ifdef IEM_WITHOUT_ASSEMBLY
16442static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16443{
16444 RTFLOAT32U r32Src1;
16445 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16446
16447 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16448 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16449 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16450}
16451
16452
16453IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16454{
16455 RT_NOREF(puSrc1);
16456
16457 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16458 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16459}
16460#endif
16461
16462
16463/**
16464 * CVTDQ2PS
16465 */
16466#ifdef IEM_WITHOUT_ASSEMBLY
16467static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
16468{
16469 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16470 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
16471 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16472}
16473
16474
16475IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16476{
16477 RT_NOREF(puSrc1);
16478
16479 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16480 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16481 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
16482 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
16483}
16484#endif
16485
16486
16487/**
16488 * CVTPS2DQ
16489 */
16490#ifdef IEM_WITHOUT_ASSEMBLY
16491static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16492{
16493 RTFLOAT32U r32Src;
16494 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16495
16496 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16497 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16498 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16499}
16500
16501
16502IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16503{
16504 RT_NOREF(puSrc1);
16505
16506 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16507 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16508 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16509 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16510}
16511#endif
16512
16513
16514/**
16515 * CVTTPS2DQ
16516 */
16517#ifdef IEM_WITHOUT_ASSEMBLY
16518static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16519{
16520 RTFLOAT32U r32Src;
16521 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16522
16523 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16524 SoftState.roundingMode = softfloat_round_minMag;
16525 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16526 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16527}
16528
16529
16530IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16531{
16532 RT_NOREF(puSrc1);
16533
16534 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16535 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16536 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16537 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16538}
16539#endif
16540
16541
16542/**
16543 * CVTTPD2DQ
16544 */
16545#ifdef IEM_WITHOUT_ASSEMBLY
16546static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16547{
16548 RTFLOAT64U r64Src;
16549 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16550
16551 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16552 SoftState.roundingMode = softfloat_round_minMag;
16553 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16554 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16555}
16556
16557
16558IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16559{
16560 RT_NOREF(puSrc1);
16561
16562 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16563 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16564 pResult->uResult.au64[1] = 0;
16565}
16566#endif
16567
16568
16569/**
16570 * CVTDQ2PD
16571 */
16572#ifdef IEM_WITHOUT_ASSEMBLY
16573static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
16574{
16575 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16576 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
16577 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16578}
16579
16580
16581IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16582{
16583 RT_NOREF(puSrc1);
16584
16585 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16586 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16587}
16588#endif
16589
16590
16591/**
16592 * CVTPD2DQ
16593 */
16594#ifdef IEM_WITHOUT_ASSEMBLY
16595static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16596{
16597 RTFLOAT64U r64Src;
16598 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16599
16600 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16601 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16602 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16603}
16604
16605
16606IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16607{
16608 RT_NOREF(puSrc1);
16609
16610 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16611 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16612 pResult->uResult.au64[1] = 0;
16613}
16614#endif
16615
16616
16617/**
16618 * [V]SHUFPS
16619 */
16620#ifdef IEM_WITHOUT_ASSEMBLY
16621IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16622{
16623 RTUINT128U const uSrc1 = *puDst;
16624 RTUINT128U const uSrc2 = *puSrc;
16625 ASMCompilerBarrier();
16626 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16627 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16628 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16629 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16630}
16631#endif
16632
16633
16634IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16635{
16636 RTUINT128U const uSrc1 = *puSrc1;
16637 RTUINT128U const uSrc2 = *puSrc2;
16638 ASMCompilerBarrier();
16639 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16640 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16641 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16642 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16643}
16644
16645
16646IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16647{
16648 RTUINT256U const uSrc1 = *puSrc1;
16649 RTUINT256U const uSrc2 = *puSrc2;
16650 ASMCompilerBarrier();
16651 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16652 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16653 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16654 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16655
16656 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
16657 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
16658 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
16659 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
16660}
16661
16662
16663/**
16664 * [V]SHUFPD
16665 */
16666#ifdef IEM_WITHOUT_ASSEMBLY
16667IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16668{
16669 RTUINT128U const uSrc1 = *puDst;
16670 RTUINT128U const uSrc2 = *puSrc;
16671 ASMCompilerBarrier();
16672 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16673 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16674}
16675#endif
16676
16677
16678IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16679{
16680 RTUINT128U const uSrc1 = *puSrc1;
16681 RTUINT128U const uSrc2 = *puSrc2;
16682 ASMCompilerBarrier();
16683 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16684 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16685}
16686
16687
16688IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16689{
16690 RTUINT256U const uSrc1 = *puSrc1;
16691 RTUINT256U const uSrc2 = *puSrc2;
16692 ASMCompilerBarrier();
16693 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16694 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16695 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
16696 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
16697}
16698
16699
16700/*
16701 * PHMINPOSUW / VPHMINPOSUW
16702 */
16703IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16704{
16705 uint16_t u16Min = puSrc->au16[0];
16706 uint8_t idxMin = 0;
16707
16708 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
16709 if (puSrc->au16[i] < u16Min)
16710 {
16711 u16Min = puSrc->au16[i];
16712 idxMin = i;
16713 }
16714
16715 puDst->au64[0] = 0;
16716 puDst->au64[1] = 0;
16717 puDst->au16[0] = u16Min;
16718 puDst->au16[1] = idxMin;
16719}
16720
16721
16722IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16723{
16724 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
16725}
16726
16727
16728/*
16729 * [V]PBLENDVB
16730 */
16731IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16732{
16733 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16734 if (puMask->au8[i] & RT_BIT(7))
16735 puDst->au8[i] = puSrc->au8[i];
16736}
16737
16738
16739IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16740{
16741 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16742 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
16743}
16744
16745
16746IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16747{
16748 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16749 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
16750}
16751
16752
16753/*
16754 * [V]BLENDVPS
16755 */
16756IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16757{
16758 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16759 if (puMask->au32[i] & RT_BIT_32(31))
16760 puDst->au32[i] = puSrc->au32[i];
16761}
16762
16763
16764IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16765{
16766 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16767 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
16768}
16769
16770
16771IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16772{
16773 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16774 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
16775}
16776
16777
16778/*
16779 * [V]BLENDVPD
16780 */
16781IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16782{
16783 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
16784 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
16785}
16786
16787
16788IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16789{
16790 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16791 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
16792}
16793
16794
16795IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16796{
16797 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16798 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
16799}
16800
16801
16802/**
16803 * [V]PALIGNR
16804 */
16805IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
16806{
16807 uint64_t const u64Src1 = *pu64Dst;
16808 ASMCompilerBarrier();
16809
16810 if (bEvil >= 16)
16811 *pu64Dst = 0;
16812 else if (bEvil >= 8)
16813 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
16814 else
16815 {
16816 uint8_t cShift = bEvil * 8;
16817 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
16818 | (u64Src2 >> cShift);
16819 }
16820}
16821
16822
16823IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16824{
16825 RTUINT128U const uSrc1 = *puDst;
16826 RTUINT128U const uSrc2 = *puSrc;
16827 ASMCompilerBarrier();
16828
16829 puDst->au64[0] = 0;
16830 puDst->au64[1] = 0;
16831 if (bEvil >= 32)
16832 { /* Everything stays 0. */ }
16833 else if (bEvil >= 16)
16834 {
16835 bEvil -= 16;
16836 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16837 puDst->au8[i - bEvil] = uSrc1.au8[i];
16838 }
16839 else
16840 {
16841 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16842 puDst->au8[i] = uSrc2.au8[i + bEvil];
16843 for (uint8_t i = 0; i < bEvil; i++)
16844 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16845 }
16846}
16847
16848
16849IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16850{
16851 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16852 RTUINT128U const uSrc2 = *puSrc2;
16853 ASMCompilerBarrier();
16854
16855 puDst->au64[0] = 0;
16856 puDst->au64[1] = 0;
16857 if (bEvil >= 32)
16858 { /* Everything stays 0. */ }
16859 else if (bEvil >= 16)
16860 {
16861 bEvil -= 16;
16862 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16863 puDst->au8[i - bEvil] = uSrc1.au8[i];
16864 }
16865 else
16866 {
16867 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16868 puDst->au8[i] = uSrc2.au8[i + bEvil];
16869 for (uint8_t i = 0; i < bEvil; i++)
16870 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16871 }
16872}
16873
16874
16875IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16876{
16877 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16878 RTUINT256U const uSrc2 = *puSrc2;
16879 ASMCompilerBarrier();
16880
16881 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
16882 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
16883}
16884
16885
16886/**
16887 * [V]PBLENDW
16888 */
16889IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16890{
16891 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16892 if (bEvil & RT_BIT(i))
16893 puDst->au16[i] = puSrc->au16[i];
16894}
16895
16896
16897IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16898{
16899 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16900 if (bEvil & RT_BIT(i))
16901 puDst->au16[i] = puSrc2->au16[i];
16902 else
16903 puDst->au16[i] = puSrc1->au16[i];
16904}
16905
16906
16907IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16908{
16909 for (uint8_t i = 0; i < 8; i++)
16910 if (bEvil & RT_BIT(i))
16911 {
16912 puDst->au16[ i] = puSrc2->au16[ i];
16913 puDst->au16[8 + i] = puSrc2->au16[8 + i];
16914 }
16915 else
16916 {
16917 puDst->au16[ i] = puSrc1->au16[ i];
16918 puDst->au16[8 + i] = puSrc1->au16[8 + i];
16919 }
16920}
16921
16922
16923/**
16924 * [V]BLENDPS
16925 */
16926IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16927{
16928 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16929 if (bEvil & RT_BIT(i))
16930 puDst->au32[i] = puSrc->au32[i];
16931}
16932
16933
16934IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16935{
16936 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16937 if (bEvil & RT_BIT(i))
16938 puDst->au32[i] = puSrc2->au32[i];
16939 else
16940 puDst->au32[i] = puSrc1->au32[i];
16941}
16942
16943
16944IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16945{
16946 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16947 if (bEvil & RT_BIT(i))
16948 puDst->au32[i] = puSrc2->au32[i];
16949 else
16950 puDst->au32[i] = puSrc1->au32[i];
16951}
16952
16953
16954/**
16955 * [V]BLENDPD
16956 */
16957IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16958{
16959 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16960 if (bEvil & RT_BIT(i))
16961 puDst->au64[i] = puSrc->au64[i];
16962}
16963
16964
16965IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16966{
16967 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16968 if (bEvil & RT_BIT(i))
16969 puDst->au64[i] = puSrc2->au64[i];
16970 else
16971 puDst->au64[i] = puSrc1->au64[i];
16972}
16973
16974
16975IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16976{
16977 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16978 if (bEvil & RT_BIT(i))
16979 puDst->au64[i] = puSrc2->au64[i];
16980 else
16981 puDst->au64[i] = puSrc1->au64[i];
16982}
16983
16984
16985/**
16986 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
16987 */
16988
/* The S-Box lookup table (256 entries, 16 per row; the row comment gives the index of the first entry). */
static uint8_t iemAImpl_aes_sbox[] =
{
    /* 0x00 */ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    /* 0x10 */ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    /* 0x20 */ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    /* 0x30 */ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    /* 0x40 */ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    /* 0x50 */ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    /* 0x60 */ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    /* 0x70 */ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    /* 0x80 */ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    /* 0x90 */ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    /* 0xa0 */ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    /* 0xb0 */ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    /* 0xc0 */ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    /* 0xd0 */ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    /* 0xe0 */ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    /* 0xf0 */ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
17007
/* The InvS-Box lookup table (inverse byte substitution), 16 entries per row. */
static uint8_t iemAImpl_aes_inv_sbox[256] = {
    /* 0x00 */ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    /* 0x10 */ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    /* 0x20 */ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    /* 0x30 */ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    /* 0x40 */ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    /* 0x50 */ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    /* 0x60 */ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    /* 0x70 */ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    /* 0x80 */ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    /* 0x90 */ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    /* 0xa0 */ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    /* 0xb0 */ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    /* 0xc0 */ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    /* 0xd0 */ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    /* 0xe0 */ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    /* 0xf0 */ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
17027
/* The ShiftRows lookup table: output byte i is taken from input byte tbl[i]. */
static uint8_t iemAImpl_aes_shift_rows_tbl[16] = {
     0,  5, 10, 15,
     4,  9, 14,  3,
     8, 13,  2,  7,
    12,  1,  6, 11
};
17032
/* The InvShiftRows lookup table: output byte i is taken from input byte tbl[i]. */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[16] = {
     0, 13, 10,  7,
     4,  1, 14, 11,
     8,  5,  2, 15,
    12,  9,  6,  3
};
17037
17038static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
17039{
17040 RTUINT128U uVal;
17041 int i;
17042
17043 for (i = 0; i < 16; ++i)
17044 uVal.au8[i] = abSubst[puSrc->au8[i]];
17045
17046 return uVal;
17047}
17048
/**
 * Doubles a byte with reduction: shifts left by one and XORs in 0x1b (27)
 * when the top bit of the input was set.
 *
 * @returns The low 8 bits of the result.
 * @param   u   The input byte.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    uint8_t const bReduce = (u & 0x80) ? 0x1b : 0x00;
    return (uint8_t)((u << 1) ^ bReduce);
}
17053
17054static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
17055{
17056 RTUINT128U uVal;
17057 int i;
17058 uint8_t tmp;
17059
17060 for (i = 0; i < 16; i += 4) {
17061 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
17062 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
17063 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
17064 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
17065 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
17066 }
17067
17068 return uVal;
17069}
17070
17071static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
17072{
17073 RTUINT128U uVal;
17074 int i;
17075
17076 for (i = 0; i < 16; ++i)
17077 uVal.au8[i] = puSrc->au8[abShift[i]];
17078
17079 return uVal;
17080}
17081
/**
 * Multiplies @a a by @a b using repeated iemAImpl_aes_xtime doubling.
 *
 * Only bits 0 thru 4 of @a b are taken into account.
 *
 * @returns The XOR of xtime^n(a) for every set bit n (0..4) in @a b.
 * @param   a   The value to multiply.
 * @param   b   The multiplier (low five bits used).
 */
static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
{
    uint8_t bProduct = 0;
    uint8_t bTerm    = a;   /* a doubled iBit times */

    for (unsigned iBit = 0; iBit <= 4; iBit++)
    {
        if ((b >> iBit) & 1)
            bProduct ^= bTerm;
        bTerm = iemAImpl_aes_xtime(bTerm);
    }

    return bProduct;
}
17094
17095static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
17096{
17097 RTUINT128U uVal;
17098 int i;
17099
17100 for (i = 0; i < 16; i += 4) {
17101 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
17102 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
17103 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
17104 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
17105 }
17106
17107 return uVal;
17108}
17109
17110static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
17111{
17112 RTUINT32U uTmp;
17113
17114 uTmp.au32[0] = w;
17115 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
17116 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
17117 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
17118 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
17119
17120 return uTmp.au32[0];
17121}
17122
/**
 * Rotates a 32-bit word right by 8 bits (the lowest byte becomes the highest).
 *
 * @returns The rotated word.
 * @param   w   The input word.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uUpperThree = w >> 8;    /* bytes 3..1 move down one position */
    uint32_t const uLowestByte = w << 24;   /* byte 0 wraps around to the top */
    return uLowestByte | uUpperThree;
}
17127
17128/**
17129 * [V]AESKEYGENASSIST
17130 */
17131IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
17132{
17133 RTUINT128U uTmp;
17134 uint32_t uRCon = bImm; /* Round constant. */
17135
17136 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
17137 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
17138 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
17139 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
17140
17141 *puDst = uTmp;
17142}
17143
17144
17145/**
17146 * [V]AESIMC
17147 */
17148IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17149{
17150 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
17151}
17152
17153
17154/**
17155 * [V]AESENC
17156 */
17157IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17158{
17159 RTUINT128U uTmp;
17160
17161 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17162 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17163 uTmp = iemAImpl_aes_mix_col(&uTmp);
17164 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17165 uTmp.au64[1] ^= puSrc->au64[1];
17166
17167 *puDst = uTmp;
17168}
17169
17170
17171/**
17172 * [V]AESENCLAST
17173 */
17174IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17175{
17176 RTUINT128U uTmp;
17177
17178 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17179 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17180 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17181 uTmp.au64[1] ^= puSrc->au64[1];
17182
17183 *puDst = uTmp;
17184}
17185
17186
17187/**
17188 * [V]AESDEC
17189 */
17190IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17191{
17192 RTUINT128U uTmp;
17193
17194 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17195 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17196 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
17197 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17198 uTmp.au64[1] ^= puSrc->au64[1];
17199
17200 *puDst = uTmp;
17201}
17202
17203
17204/**
17205 * [V]AESDECLAST
17206 */
17207IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17208{
17209 RTUINT128U uTmp;
17210
17211 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17212 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17213 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17214 uTmp.au64[1] ^= puSrc->au64[1];
17215
17216 *puDst = uTmp;
17217}
17218
17219
17220/**
17221 * [V]PCMPISTRI
17222 */
17223
17224/**
17225 * Does the comparisons based on the mode and source input format.
17226 */
17227static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
17228{
17229#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
17230 do \
17231 { \
17232 for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
17233 for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
17234 { \
17235 switch (a_bAggOp) \
17236 { \
17237 case 0: \
17238 case 2: \
17239 case 3: \
17240 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
17241 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
17242 break; \
17243 case 1: \
17244 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
17245 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
17246 break; \
17247 default: \
17248 AssertReleaseFailed(); \
17249 } \
17250 } \
17251 } while(0)
17252
17253 uint8_t bAggOp = (bImm >> 2) & 0x3;
17254 switch (bImm & 0x3)
17255 {
17256 case 0:
17257 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
17258 break;
17259 case 1:
17260 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
17261 break;
17262 case 2:
17263 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
17264 break;
17265 case 3:
17266 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
17267 break;
17268 default:
17269 AssertReleaseFailed();
17270 }
17271#undef PCMPXSTRX_CMP_CASE
17272}
17273
17274static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
17275{
17276 if (bImm & 0x1)
17277 {
17278 /* Words -> 8 elements. */
17279 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
17280 if (puSrc->au16[i] == 0)
17281 return i;
17282
17283 return 8;
17284 }
17285 else
17286 {
17287 /* Bytes -> 16 elements. */
17288 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
17289 if (puSrc->au8[i] == 0)
17290 return i;
17291
17292 return 16;
17293 }
17294}
17295
/**
 * Converts an explicit (register supplied) string length to an element count.
 *
 * @returns The absolute value of @a i64Len, saturated to the element count
 *          (8 for words, 16 for bytes).
 * @param   i64Len  The signed length value.
 * @param   bImm    The immediate; bit 0 set means word elements, clear means
 *                  byte elements.
 */
static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
{
    int64_t const cMaxElems = (bImm & 0x1) ? 8 : 16;

    /* Anything at or beyond +/- the element count saturates. */
    if (i64Len <= -cMaxElems || i64Len >= cMaxElems)
        return (uint8_t)cMaxElems;

    return (uint8_t)(i64Len >= 0 ? i64Len : -i64Len);
}
17313
/**
 * Valid/Invalid override of comparisons (Table 4-7 from 4.1.6 of SDM).
 *
 * First index is Imm8[3:2] (the aggregation operation), second index is:
 *      - 0: xmm1 AND xmm2/m128 invalid,
 *      - 1: xmm1 invalid, xmm2/m128 valid,
 *      - 2: xmm1 valid, xmm2/m128 invalid.
 */
static const bool g_afCmpOverride[4][3] =
{
    /* 00b (equal any):     */ { false, false, false },
    /* 01b (ranges):        */ { false, false, false },
    /* 10b (equal each):    */ { true,  false, false },
    /* 11b (equal ordered): */ { true,  true,  false },
};
17325
17326DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
17327{
17328 if (fSrc1Valid && fSrc2Valid)
17329 return fCmpRes;
17330
17331 uint8_t bSrc1Valid = fSrc1Valid ? 2 : 0;
17332 uint8_t bSrc2Valid = fSrc2Valid ? 1 : 0;
17333 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
17334}
17335
/**
 * Aggregates the comparison matrix into a result bitmap (one bit per element
 * of the second source) and applies the polarity selected by the immediate.
 *
 * Comparisons involving elements at or beyond the string lengths are replaced
 * by iemAImpl_pcmpxstrx_cmp_override_if_invalid.
 *
 * @returns The result bitmap after polarity has been applied.
 * @param   afCmpRes    The comparison matrix, indexed [src2 element][src1 element].
 * @param   idxLen1     Number of valid elements in the first source.
 * @param   idxLen2     Number of valid elements in the second source.
 * @param   cElems      Number of elements per operand (8 or 16).
 * @param   bImm        The immediate: bits 3:2 = aggregation operation,
 *                      bits 5:4 = polarity.
 */
static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
{
    uint8_t const bAggOp  = (bImm >> 2) & 0x3;
    uint16_t      fIntRes = 0;

    if (bAggOp == 0)
    {
        /* Equal any: bit set if the src2 element matches any src1 element. */
        for (uint8_t iHay = 0; iHay < cElems; iHay++)
        {
            bool fHit = false;
            for (uint8_t iSet = 0; iSet < cElems && !fHit; iSet++)
                fHit = iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[iHay][iSet],
                                                                  iSet < idxLen1,
                                                                  iHay < idxLen2,
                                                                  bAggOp);
            if (fHit)
                fIntRes |= RT_BIT(iHay);
        }
    }
    else if (bAggOp == 1)
    {
        /* Ranges: bit set if the src2 element satisfies both comparisons of any src1 pair. */
        for (uint8_t iHay = 0; iHay < cElems; iHay++)
        {
            bool fHit = false;
            for (uint8_t iLow = 0; iLow < cElems && !fHit; iLow += 2)
                fHit =    iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[iHay][iLow],
                                                                     iLow < idxLen1,
                                                                     iHay < idxLen2,
                                                                     bAggOp)
                       && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[iHay][iLow + 1],
                                                                     (iLow + 1) < idxLen1,
                                                                     iHay < idxLen2,
                                                                     bAggOp);
            if (fHit)
                fIntRes |= RT_BIT(iHay);
        }
    }
    else if (bAggOp == 2)
    {
        /* Equal each: only the diagonal of the matrix is considered. */
        for (uint8_t iElem = 0; iElem < cElems; iElem++)
            if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[iElem][iElem],
                                                           iElem < idxLen1,
                                                           iElem < idxLen2,
                                                           bAggOp))
                fIntRes |= RT_BIT(iElem);
    }
    else
    {
        /* Equal ordered: bit set if src1 matches src2 starting at that src2 position. */
        for (uint8_t iStart = 0; iStart < cElems; iStart++)
        {
            bool fMatch = true;
            for (uint8_t iNeedle = 0, iHay = iStart;
                 fMatch && (iNeedle < (cElems - iStart)) && (iHay < cElems);
                 iNeedle++, iHay++)
                fMatch = iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[iHay][iNeedle],
                                                                    iNeedle < idxLen1,
                                                                    iHay < idxLen2,
                                                                    bAggOp);
            if (fMatch)
                fIntRes |= RT_BIT(iStart);
        }
    }

    /* Polarity selection. */
    switch ((bImm >> 4) & 0x3)
    {
        case 1: /* Negate all element bits. */
            fIntRes = (cElems == 8 ? 0xff : 0xffff) ^ fIntRes;
            break;
        case 3: /* Negate only the bits below idxLen2. */
            fIntRes ^= RT_BIT(idxLen2) - 1;
            break;
        case 0:
        case 2:
            /* Nothing to do. */
            break;
        default:
            AssertReleaseFailed();
    }

    return fIntRes;
}
17439
17440DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
17441{
17442 uint32_t fEFlags = 0;
17443
17444 if (u16Result)
17445 fEFlags |= X86_EFL_CF;
17446 if (cLen2 < cElems)
17447 fEFlags |= X86_EFL_ZF;
17448 if (cLen1 < cElems)
17449 fEFlags |= X86_EFL_SF;
17450 if (u16Result & 0x1)
17451 fEFlags |= X86_EFL_OF;
17452 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
17453}
17454
17455DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
17456 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
17457{
17458 bool afCmpRes[16][16];
17459 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17460
17461 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
17462 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
17463 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
17464
17465 return u16Result;
17466}
17467
17468DECL_FORCE_INLINE(void) iemAImpl_pcmpxstri_set_result_index(uint32_t *pu32Ecx, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
17469{
17470 if (bImm & RT_BIT(6))
17471 {
17472 /* Index for MSB set. */
17473 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
17474 if (idxMsb)
17475 *pu32Ecx = idxMsb - 1;
17476 else
17477 *pu32Ecx = cElems;
17478 }
17479 else
17480 {
17481 /* Index for LSB set. */
17482 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
17483 if (idxLsb)
17484 *pu32Ecx = idxLsb - 1;
17485 else
17486 *pu32Ecx = cElems;
17487 }
17488}
17489
17490IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
17491{
17492 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17493 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
17494 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
17495
17496 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17497 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
17498}
17499
17500
17501/**
17502 * [V]PCMPESTRI
17503 */
17504IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
17505{
17506 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17507 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
17508 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
17509
17510 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17511 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
17512}
17513
17514
17515/**
17516 * [V]PCMPISTRM
17517 */
17518DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
17519{
17520 if (bImm & RT_BIT(6))
17521 {
17522 /* Generate a mask. */
17523 if (cElems == 8)
17524 {
17525 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17526 if (u16Result & RT_BIT(i))
17527 puDst->au16[i] = 0xffff;
17528 else
17529 puDst->au16[i] = 0;
17530 }
17531 else
17532 {
17533 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17534 if (u16Result & RT_BIT(i))
17535 puDst->au8[i] = 0xff;
17536 else
17537 puDst->au8[i] = 0;
17538 }
17539 }
17540 else
17541 {
17542 /* Store the result. */
17543 puDst->au64[0] = u16Result;
17544 puDst->au64[1] = 0;
17545 }
17546}
17547
17548IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
17549{
17550 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17551 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
17552 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
17553
17554 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17555 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
17556}
17557
17558
17559/**
17560 * [V]PCMPESTRM
17561 */
17562IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
17563{
17564 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17565 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
17566 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
17567
17568 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17569 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
17570}
17571
17572
17573/*
17574 * [V]PCLMULQDQ
17575 */
17576IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17577{
17578 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
17579}
17580
17581
17582IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17583{
17584 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
17585 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
17586
17587 puDst->au64[0] = 0;
17588 puDst->au64[1] = 0;
17589
17590 /*
17591 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
17592 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
17593 * and squeeze out some optimizations.
17594 */
17595 if (uSrc1 & 0x1)
17596 puDst->au64[0] = uSrc2;
17597
17598 uSrc1 >>= 1;
17599
17600 uint8_t iDigit = 1;
17601 while (uSrc1)
17602 {
17603 if (uSrc1 & 0x1)
17604 {
17605 puDst->au64[0] ^= (uSrc2 << iDigit);
17606 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
17607 }
17608
17609 uSrc1 >>= 1;
17610 iDigit++;
17611 }
17612}
17613
17614
17615/**
17616 * [V]PINSRW
17617 */
17618#ifdef IEM_WITHOUT_ASSEMBLY
17619IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u64,(uint64_t *pu64Dst, uint16_t u16Src, uint8_t bEvil))
17620{
17621 uint8_t cShift = (bEvil & 0x3) * 16;
17622 *pu64Dst = (*pu64Dst & ~(UINT64_C(0xffff) << cShift)) | ((uint64_t)u16Src << cShift);
17623}
17624
17625
17626IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u128,(PRTUINT128U puDst, uint16_t u16Src, uint8_t bEvil))
17627{
17628 puDst->au16[bEvil & 0x7] = u16Src;
17629}
17630#endif
17631
17632
17633IEM_DECL_IMPL_DEF(void, iemAImpl_vpinsrw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint16_t u16Src, uint8_t bEvil))
17634{
17635 *puDst = *puSrc;
17636 puDst->au16[bEvil & 0x7] = u16Src;
17637}
17638
17639
17640/**
17641 * [V]PEXTRW
17642 */
17643#ifdef IEM_WITHOUT_ASSEMBLY
17644IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u64,(uint16_t *pu16Dst, uint64_t u64Src, uint8_t bEvil))
17645{
17646 *pu16Dst = (uint16_t)(u64Src >> ((bEvil & 0x3) * 16));
17647}
17648
17649
17650IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u128,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
17651{
17652 *pu16Dst = puSrc->au16[bEvil & 0x7];
17653}
17654
17655#endif
17656
17657IEM_DECL_IMPL_DEF(void, iemAImpl_vpextrw_u128_fallback,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
17658{
17659 *pu16Dst = puSrc->au16[bEvil & 0x7];
17660}
17661
17662
17663/**
17664 * [V]MOVMSKPS
17665 */
17666#ifdef IEM_WITHOUT_ASSEMBLY
17667IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17668{
17669 *pu8Dst = puSrc->au32[0] >> 31;
17670 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17671 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17672 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17673}
17674
17675#endif
17676
17677IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17678{
17679 *pu8Dst = puSrc->au32[0] >> 31;
17680 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17681 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17682 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17683}
17684
17685
17686IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
17687{
17688 *pu8Dst = puSrc->au32[0] >> 31;
17689 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17690 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17691 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17692 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
17693 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
17694 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
17695 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
17696}
17697
17698
17699/**
17700 * [V]MOVMSKPD
17701 */
17702#ifdef IEM_WITHOUT_ASSEMBLY
17703IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17704{
17705 *pu8Dst = puSrc->au64[0] >> 63;
17706 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17707}
17708
17709#endif
17710
17711IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17712{
17713 *pu8Dst = puSrc->au64[0] >> 63;
17714 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17715}
17716
17717
17718IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
17719{
17720 *pu8Dst = puSrc->au64[0] >> 63;
17721 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17722 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
17723 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
17724}
17725
17726
17727/**
17728 * CVTTSD2SI
17729 */
17730#ifdef IEM_WITHOUT_ASSEMBLY
17731IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
17732{
17733 RTFLOAT64U r64Src;
17734
17735 r64Src.u = *pu64Src;
17736 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17737
17738 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17739 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17740 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17741}
17742
17743
17744IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
17745{
17746 RTFLOAT64U r64Src;
17747
17748 r64Src.u = *pu64Src;
17749 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17750
17751 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17752 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17753 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17754}
17755#endif
17756
17757
17758/**
17759 * CVTSD2SI
17760 */
17761#ifdef IEM_WITHOUT_ASSEMBLY
17762IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
17763{
17764 RTFLOAT64U r64Src;
17765
17766 r64Src.u = *pu64Src;
17767 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17768
17769 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17770 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17771 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17772}
17773
17774
17775IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
17776{
17777 RTFLOAT64U r64Src;
17778
17779 r64Src.u = *pu64Src;
17780 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17781
17782 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17783 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17784 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17785}
17786#endif
17787
17788
17789/**
17790 * CVTTSS2SI
17791 */
17792#ifdef IEM_WITHOUT_ASSEMBLY
17793IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
17794{
17795 RTFLOAT32U r32Src;
17796
17797 r32Src.u = *pu32Src;
17798 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17799
17800 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17801 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17802 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17803}
17804
17805
17806IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
17807{
17808 RTFLOAT32U r32Src;
17809
17810 r32Src.u = *pu32Src;
17811 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17812
17813 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17814 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17815 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17816}
17817#endif
17818
17819
17820/**
17821 * CVTSS2SI
17822 */
17823#ifdef IEM_WITHOUT_ASSEMBLY
17824IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
17825{
17826 RTFLOAT32U r32Src;
17827
17828 r32Src.u = *pu32Src;
17829 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17830
17831 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17832 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17833 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17834}
17835
17836
17837IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
17838{
17839 RTFLOAT32U r32Src;
17840
17841 r32Src.u = *pu32Src;
17842 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17843
17844 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17845 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17846 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17847}
17848#endif
17849
17850
17851/**
17852 * CVTSI2SD
17853 */
17854#ifdef IEM_WITHOUT_ASSEMBLY
17855IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
17856{
17857 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17858 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
17859 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
17860}
17861
17862
17863IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
17864{
17865 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17866 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
17867 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
17868}
17869#endif
17870
17871
17872/**
17873 * CVTSI2SS
17874 */
17875#ifdef IEM_WITHOUT_ASSEMBLY
17876IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
17877{
17878 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17879 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
17880 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
17881}
17882
17883
17884IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
17885{
17886 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17887 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
17888 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
17889}
17890#endif
17891
17892
17893/**
17894 * [V]UCOMISS
17895 */
17896#ifdef IEM_WITHOUT_ASSEMBLY
17897IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17898{
17899 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17900
17901 if (RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]))
17902 {
17903 *pfMxcsr |= X86_MXCSR_IE;
17904 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17905 }
17906 else if (RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
17907 {
17908 /* ucomiss doesn't raise \#IE for quiet NaNs. */
17909 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17910 }
17911 else
17912 {
17913 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17914
17915 RTFLOAT32U r32Src1, r32Src2;
17916 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
17917 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
17918
17919 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17920 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17921 if (f32_eq(f32Src1, f32Src2, &SoftState))
17922 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17923 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17924 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17925 /* else: GREATER_THAN 000 */
17926
17927 *pfMxcsr |= fDe;
17928 }
17929
17930 *pfEFlags = fEFlagsNew;
17931}
17932#endif
17933
/**
 * VUCOMISS fallback: forwards all arguments unchanged to iemAImpl_ucomiss_u128().
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
17938
17939
17940/**
17941 * [V]UCOMISD
17942 */
17943#ifdef IEM_WITHOUT_ASSEMBLY
17944IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17945{
17946 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17947
17948 if (RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]))
17949 {
17950 *pfMxcsr |= X86_MXCSR_IE;
17951 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17952 }
17953 else if (RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
17954 {
17955 /* ucomiss doesn't raise \#IE for quiet NaNs. */
17956 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17957 }
17958 else
17959 {
17960 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17961
17962 RTFLOAT64U r64Src1, r64Src2;
17963 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0])
17964 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
17965
17966 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17967 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17968 if (f64_eq(f64Src1, f64Src2, &SoftState))
17969 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17970 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17971 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17972 /* else: GREATER_THAN 000 */
17973
17974 *pfMxcsr |= fDe;
17975 }
17976
17977 *pfEFlags = fEFlagsNew;
17978}
17979#endif
17980
/**
 * VUCOMISD fallback: forwards all arguments unchanged to iemAImpl_ucomisd_u128().
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
17985
17986
17987/**
17988 * [V]COMISS
17989 */
17990#ifdef IEM_WITHOUT_ASSEMBLY
17991IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17992{
17993 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17994
17995 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
17996 || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
17997 {
17998 *pfMxcsr |= X86_MXCSR_IE;
17999 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18000 }
18001 else
18002 {
18003 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18004
18005 RTFLOAT32U r32Src1, r32Src2;
18006 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0])
18007 | iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
18008
18009 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18010 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18011 if (f32_eq(f32Src1, f32Src2, &SoftState))
18012 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18013 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18014 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18015 /* else: GREATER_THAN 000 */
18016
18017 *pfMxcsr |= fDe;
18018 }
18019
18020 *pfEFlags = fEFlagsNew;
18021}
18022#endif
18023
18024
/**
 * VCOMISS fallback: forwards all arguments unchanged to iemAImpl_comiss_u128().
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
18029
18030
18031/**
18032 * [V]COMISD
18033 */
18034#ifdef IEM_WITHOUT_ASSEMBLY
18035IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
18036{
18037 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18038
18039 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
18040 || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
18041 {
18042 *pfMxcsr |= X86_MXCSR_IE;
18043 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18044 }
18045 else
18046 {
18047 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18048
18049 RTFLOAT64U r64Src1, r64Src2;
18050 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
18051 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
18052
18053 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18054 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18055 if (f64_eq(f64Src1, f64Src2, &SoftState))
18056 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18057 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18058 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18059 /* else: GREATER_THAN 000 */
18060
18061 *pfMxcsr |= fDe;
18062 }
18063
18064 *pfEFlags = fEFlagsNew;
18065}
18066#endif
18067
/**
 * VCOMISD fallback: forwards all arguments unchanged to iemAImpl_comisd_u128().
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
18072
18073
18074/**
18075 * CMPPS / CMPPD / CMPSS / CMPSD
18076 */
18077#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * A compare truth table entry.
 *
 * Describes one CMPPS/CMPPD/CMPSS/CMPSD predicate: the boolean result for
 * each possible relation between the operands, and whether a quiet NaN
 * operand raises the invalid-operation flag.  The member order must match
 * the positional initialisers of g_aCmpTbl.
 */
typedef struct CMPTRUTHTBLENTRY
{
    /** Flag whether \#IE (X86_MXCSR_IE) is signalled when one of the source
     *  operands is a QNaN.  Signalling NaNs always signal, regardless. */
    bool fSignalsOnQNan;
    /** The boolean result when the input operands are unordered. */
    bool fUnordered;
    /** The boolean result when A = B. */
    bool fEqual;
    /** The boolean result when A < B. */
    bool fLowerThan;
    /** The boolean result when A > B. */
    bool fGreaterThan;
} CMPTRUTHTBLENTRY;
/** Pointer to a const truth table entry. */
typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
18096
18097
/** The compare truth table (indexed by immediate).
 *  Only the eight predicates 0..7 are present; the callers mask the
 *  immediate with 0x7 and the workers assert the index is in range. */
static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
{
    /*                      fSignalsOnQNan  fUnordered  fEqual  fLowerThan  fGreaterThan */
    /* 00H (EQ_OQ)   */ {   false,          false,      true,   false,      false },
    /* 01H (LT_OS)   */ {   true,           false,      false,  true,       false },
    /* 02H (LE_OS)   */ {   true,           false,      true,   true,       false },
    /* 03H (UNORD_Q) */ {   false,          true,       false,  false,      false },
    /* 04H (NEQ_UQ)  */ {   false,          true,       false,  true,       true  },
    /* 05H (NLT_US)  */ {   true,           true,       true,   false,      true  },
    /* 06H (NLE_US)  */ {   true,           true,       false,  false,      true  },
    /* 07H (ORD_Q)   */ {   false,          false,      true,   true,       true  },
    /** @todo AVX variants. */
};
18112
18113
18114static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
18115{
18116 bool fRes;
18117 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18118
18119 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
18120 {
18121 *pfMxcsr |= X86_MXCSR_IE;
18122 fRes = g_aCmpTbl[bEvil].fUnordered;
18123 }
18124 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
18125 {
18126 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18127 *pfMxcsr |= X86_MXCSR_IE;
18128 fRes = g_aCmpTbl[bEvil].fUnordered;
18129 }
18130 else
18131 {
18132 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18133
18134 RTFLOAT32U r32Src1, r32Src2;
18135 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
18136 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
18137
18138 *pfMxcsr |= fDe;
18139 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18140 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18141 if (f32_eq(f32Src1, f32Src2, &SoftState))
18142 fRes = g_aCmpTbl[bEvil].fEqual;
18143 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18144 fRes = g_aCmpTbl[bEvil].fLowerThan;
18145 else
18146 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18147 }
18148
18149 return fRes;
18150}
18151
18152
18153static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
18154{
18155 bool fRes;
18156 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18157
18158 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
18159 {
18160 *pfMxcsr |= X86_MXCSR_IE;
18161 fRes = g_aCmpTbl[bEvil].fUnordered;
18162 }
18163 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
18164 {
18165 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18166 *pfMxcsr |= X86_MXCSR_IE;
18167 fRes = g_aCmpTbl[bEvil].fUnordered;
18168 }
18169 else
18170 {
18171 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18172
18173 RTFLOAT64U r64Src1, r64Src2;
18174 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
18175 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
18176
18177 *pfMxcsr |= fDe;
18178 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18179 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18180 if (f64_eq(f64Src1, f64Src2, &SoftState))
18181 fRes = g_aCmpTbl[bEvil].fEqual;
18182 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18183 fRes = g_aCmpTbl[bEvil].fLowerThan;
18184 else
18185 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18186 }
18187
18188 return fRes;
18189}
18190
18191
18192IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18193{
18194 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18195 {
18196 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
18197 puDst->au32[i] = UINT32_MAX;
18198 else
18199 puDst->au32[i] = 0;
18200 }
18201}
18202
18203
18204IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18205{
18206 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18207 {
18208 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
18209 puDst->au64[i] = UINT64_MAX;
18210 else
18211 puDst->au64[i] = 0;
18212 }
18213}
18214
18215
18216IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18217{
18218 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
18219 puDst->au32[0] = UINT32_MAX;
18220 else
18221 puDst->au32[0] = 0;
18222
18223 puDst->au32[1] = pSrc->uSrc1.au32[1];
18224 puDst->au64[1] = pSrc->uSrc1.au64[1];
18225}
18226
18227
18228IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18229{
18230 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
18231 puDst->au64[0] = UINT64_MAX;
18232 else
18233 puDst->au64[0] = 0;
18234
18235 puDst->au64[1] = pSrc->uSrc1.au64[1];
18236}
18237#endif
18238
18239
18240/**
18241 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
18242 */
18243
/** ROUNDxx imm8 bits 1:0: rounding control, shifted into the MXCSR RC field
 *  when X86_SSE_ROUNDXX_IMM_ROUND_SEL is clear. */
#define X86_SSE_ROUNDXX_IMM_RC_MASK    UINT8_C(0x03)
/** ROUNDxx imm8 bit 2: when set, the rounding control is taken from MXCSR
 *  instead of from the immediate. */
#define X86_SSE_ROUNDXX_IMM_ROUND_SEL  UINT8_C(0x04)
/** ROUNDxx imm8 bit 3: when set, the round workers pass fExact=false to the
 *  softfloat round-to-integer call. */
#define X86_SSE_ROUNDXX_IMM_PRECISION  UINT8_C(0x08)

/** The immediate bits the ROUNDxx implementations hand to their workers. */
#define X86_SSE_ROUNDXX_IMM_MASK       UINT8_C(0x0F)
18249
18250DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
18251{
18252 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
18253 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18254
18255 fMxcsr &= ~X86_MXCSR_RC_MASK;
18256 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
18257 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18258}
18259
18260static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
18261{
18262 RTFLOAT32U r32Src, r32Dst;
18263 float32_t f32Src;
18264 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18265 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18266
18267 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
18268 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
18269
18270 iemFpSoftF32ToIprt(&r32Dst, f32Src);
18271 return r32Dst;
18272}
18273
18274static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
18275{
18276 RTFLOAT64U r64Src, r64Dst;
18277 float64_t f64Src;
18278 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18279 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18280
18281 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
18282 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
18283
18284 iemFpSoftF64ToIprt(&r64Dst, f64Src);
18285 return r64Dst;
18286}
18287
18288#ifdef IEM_WITHOUT_ASSEMBLY
18289IEM_DECL_IMPL_DEF(void, iemAImpl_roundss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18290{
18291 puDst->ar32[0] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18292 puDst->au32[1] = pSrc->uSrc1.au32[1];
18293 puDst->au64[1] = pSrc->uSrc1.au64[1];
18294}
18295
18296
18297IEM_DECL_IMPL_DEF(void, iemAImpl_roundsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18298{
18299 puDst->ar64[0] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18300 puDst->au64[1] = pSrc->uSrc1.au64[1];
18301}
18302#endif
18303
18304IEM_DECL_IMPL_DEF(void, iemAImpl_roundps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18305{
18306 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18307 {
18308 puDst->ar32[i] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18309 }
18310}
18311
18312
18313IEM_DECL_IMPL_DEF(void, iemAImpl_roundpd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18314{
18315 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18316 {
18317 puDst->ar64[i] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18318 }
18319}
18320
18321/**
18322 * CVTPD2PI
18323 */
18324#ifdef IEM_WITHOUT_ASSEMBLY
18325static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18326{
18327 RTFLOAT64U r64Src;
18328 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18329
18330 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18331 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18332 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18333}
18334
18335
18336IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18337{
18338 RTUINT64U u64Res;
18339 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
18340 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
18341
18342 *pu64Dst = u64Res.u;
18343 *pfMxcsr = fMxcsrOut;
18344}
18345#endif
18346
18347
18348/**
18349 * CVTTPD2PI
18350 */
18351#ifdef IEM_WITHOUT_ASSEMBLY
18352static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18353{
18354 RTFLOAT64U r64Src;
18355 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18356
18357 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18358 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18359 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18360}
18361
18362
18363IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18364{
18365 RTUINT64U u64Res;
18366 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
18367 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
18368
18369 *pu64Dst = u64Res.u;
18370 *pfMxcsr = fMxcsrOut;
18371}
18372#endif
18373
18374
18375/**
18376 * CVTPI2PS
18377 */
18378#ifdef IEM_WITHOUT_ASSEMBLY
18379static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
18380{
18381 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18382 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
18383 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
18384}
18385
18386
18387IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
18388{
18389 RTUINT64U uSrc = { u64Src };
18390 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
18391 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
18392 *pfMxcsr = fMxcsrOut;
18393}
18394#endif
18395
18396
18397/**
18398 * CVTPI2PD
18399 */
18400#ifdef IEM_WITHOUT_ASSEMBLY
18401static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
18402{
18403 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18404 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
18405 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
18406}
18407
18408
18409IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
18410{
18411 RTUINT64U uSrc = { u64Src };
18412 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
18413 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
18414 *pfMxcsr = fMxcsrOut;
18415}
18416#endif
18417
18418
18419/**
18420 * CVTPS2PI
18421 */
18422#ifdef IEM_WITHOUT_ASSEMBLY
18423static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
18424{
18425 RTFLOAT32U r32Src;
18426 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
18427
18428 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18429 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18430 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18431}
18432
18433
18434IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
18435{
18436 RTUINT64U uDst;
18437 RTUINT64U uSrc = { u64Src };
18438 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
18439 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
18440 *pu64Dst = uDst.u;
18441 *pfMxcsr = fMxcsrOut;
18442}
18443#endif
18444
18445
18446/**
18447 * CVTTPS2PI
18448 */
18449#ifdef IEM_WITHOUT_ASSEMBLY
18450static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
18451{
18452 RTFLOAT32U r32Src;
18453 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
18454
18455 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18456 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18457 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18458}
18459
18460
18461IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
18462{
18463 RTUINT64U uDst;
18464 RTUINT64U uSrc = { u64Src };
18465 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
18466 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
18467 *pu64Dst = uDst.u;
18468 *pfMxcsr = fMxcsrOut;
18469}
18470#endif
18471
18472/**
18473 * RDRAND
18474 */
18475IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18476{
18477 *puDst = 0;
18478 *pEFlags &= ~X86_EFL_STATUS_BITS;
18479 *pEFlags |= X86_EFL_CF;
18480}
18481
18482IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18483{
18484 *puDst = 0;
18485 *pEFlags &= ~X86_EFL_STATUS_BITS;
18486 *pEFlags |= X86_EFL_CF;
18487}
18488
18489IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18490{
18491 *puDst = 0;
18492 *pEFlags &= ~X86_EFL_STATUS_BITS;
18493 *pEFlags |= X86_EFL_CF;
18494}
18495
18496/**
18497 * RDSEED
18498 */
18499IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18500{
18501 *puDst = 0;
18502 *pEFlags &= ~X86_EFL_STATUS_BITS;
18503 *pEFlags |= X86_EFL_CF;
18504}
18505
18506IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18507{
18508 *puDst = 0;
18509 *pEFlags &= ~X86_EFL_STATUS_BITS;
18510 *pEFlags |= X86_EFL_CF;
18511}
18512
18513IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18514{
18515 *puDst = 0;
18516 *pEFlags &= ~X86_EFL_STATUS_BITS;
18517 *pEFlags |= X86_EFL_CF;
18518}
18519
18520
18521/**
18522 * SHA1NEXTE
18523 */
18524IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18525{
18526 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
18527
18528 puDst->au32[0] = puSrc->au32[0];
18529 puDst->au32[1] = puSrc->au32[1];
18530 puDst->au32[2] = puSrc->au32[2];
18531 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
18532}
18533
18534/**
18535 * SHA1MSG1
18536 */
18537IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18538{
18539 uint32_t u32W0 = puDst->au32[3];
18540 uint32_t u32W1 = puDst->au32[2];
18541 uint32_t u32W2 = puDst->au32[1];
18542 uint32_t u32W3 = puDst->au32[0];
18543 uint32_t u32W4 = puSrc->au32[3];
18544 uint32_t u32W5 = puSrc->au32[2];
18545
18546 puDst->au32[3] = u32W2 ^ u32W0;
18547 puDst->au32[2] = u32W3 ^ u32W1;
18548 puDst->au32[1] = u32W4 ^ u32W2;
18549 puDst->au32[0] = u32W5 ^ u32W3;
18550}
18551
18552/**
18553 * SHA1MSG2
18554 */
18555IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18556{
18557 uint32_t u32W13 = puSrc->au32[2];
18558 uint32_t u32W14 = puSrc->au32[1];
18559 uint32_t u32W15 = puSrc->au32[0];
18560 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
18561 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
18562 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
18563 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
18564
18565 puDst->au32[3] = u32W16;
18566 puDst->au32[2] = u32W17;
18567 puDst->au32[1] = u32W18;
18568 puDst->au32[0] = u32W19;
18569}
18570
18571/**
18572 * SHA1RNDS4
18573 */
/** Signature of the SHA1RNDS4 logic functions: combines the B, C and D
 *  working values into one 32-bit word. */
typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
/** Pointer to a SHA1RNDS4 logic function. */
typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;
18576
18577static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18578{
18579 return (u32B & u32C) ^ (~u32B & u32D);
18580}
18581
18582static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18583{
18584 return u32B ^ u32C ^ u32D;
18585}
18586
18587static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18588{
18589 return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
18590}
18591
18592static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18593{
18594 return u32B ^ u32C ^ u32D;
18595}
18596
18597IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18598{
18599 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
18600 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
18601
18602 uint32_t au32A[5];
18603 uint32_t au32B[5];
18604 uint32_t au32C[5];
18605 uint32_t au32D[5];
18606 uint32_t au32E[5];
18607 uint32_t au32W[4];
18608 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
18609 uint32_t u32K = s_au32K[bEvil & 0x3];
18610
18611 au32A[0] = puDst->au32[3];
18612 au32B[0] = puDst->au32[2];
18613 au32C[0] = puDst->au32[1];
18614 au32D[0] = puDst->au32[0];
18615 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
18616 au32W[i] = puSrc->au32[3 - i];
18617
18618 /* Round 0 is a bit different than the other rounds. */
18619 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
18620 au32B[1] = au32A[0];
18621 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
18622 au32D[1] = au32C[0];
18623 au32E[1] = au32D[0];
18624
18625 for (uint32_t i = 1; i <= 3; i++)
18626 {
18627 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
18628 au32B[i + 1] = au32A[i];
18629 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
18630 au32D[i + 1] = au32C[i];
18631 au32E[i + 1] = au32D[i];
18632 }
18633
18634 puDst->au32[3] = au32A[4];
18635 puDst->au32[2] = au32B[4];
18636 puDst->au32[1] = au32C[4];
18637 puDst->au32[0] = au32D[4];
18638}
18639
18640
18641/**
18642 * SHA256MSG1
18643 */
18644DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
18645{
18646 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
18647}
18648
18649IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18650{
18651 uint32_t u32W4 = puSrc->au32[0];
18652 uint32_t u32W3 = puDst->au32[3];
18653 uint32_t u32W2 = puDst->au32[2];
18654 uint32_t u32W1 = puDst->au32[1];
18655 uint32_t u32W0 = puDst->au32[0];
18656
18657 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
18658 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
18659 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
18660 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
18661}
18662
18663/**
18664 * SHA256MSG2
18665 */
18666DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
18667{
18668 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
18669}
18670
18671IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18672{
18673 uint32_t u32W14 = puSrc->au32[2];
18674 uint32_t u32W15 = puSrc->au32[3];
18675 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
18676 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
18677 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
18678 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
18679
18680 puDst->au32[3] = u32W19;
18681 puDst->au32[2] = u32W18;
18682 puDst->au32[1] = u32W17;
18683 puDst->au32[0] = u32W16;
18684}
18685
18686/**
18687 * SHA256RNDS2
18688 */
18689DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
18690{
18691 return (u32X & u32Y) ^ (~u32X & u32Z);
18692}
18693
18694DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
18695{
18696 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
18697}
18698
18699DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
18700{
18701 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
18702}
18703
18704DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
18705{
18706 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
18707}
18708
18709IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
18710{
18711 uint32_t au32A[3];
18712 uint32_t au32B[3];
18713 uint32_t au32C[3];
18714 uint32_t au32D[3];
18715 uint32_t au32E[3];
18716 uint32_t au32F[3];
18717 uint32_t au32G[3];
18718 uint32_t au32H[3];
18719 uint32_t au32WK[2];
18720
18721 au32A[0] = puSrc->au32[3];
18722 au32B[0] = puSrc->au32[2];
18723 au32C[0] = puDst->au32[3];
18724 au32D[0] = puDst->au32[2];
18725 au32E[0] = puSrc->au32[1];
18726 au32F[0] = puSrc->au32[0];
18727 au32G[0] = puDst->au32[1];
18728 au32H[0] = puDst->au32[0];
18729
18730 au32WK[0] = puXmm0Constants->au32[0];
18731 au32WK[1] = puXmm0Constants->au32[1];
18732
18733 for (uint32_t i = 0; i < 2; i++)
18734 {
18735 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
18736 + iemAImpl_sha256_upper_sigma1(au32E[i])
18737 + au32WK[i]
18738 + au32H[i]
18739 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
18740 + iemAImpl_sha256_upper_sigma0(au32A[i]);
18741 au32B[i + 1] = au32A[i];
18742 au32C[i + 1] = au32B[i];
18743 au32D[i + 1] = au32C[i];
18744 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
18745 + iemAImpl_sha256_upper_sigma1(au32E[i])
18746 + au32WK[i]
18747 + au32H[i]
18748 + au32D[i];
18749 au32F[i + 1] = au32E[i];
18750 au32G[i + 1] = au32F[i];
18751 au32H[i + 1] = au32G[i];
18752 }
18753
18754 puDst->au32[3] = au32A[2];
18755 puDst->au32[2] = au32B[2];
18756 puDst->au32[1] = au32E[2];
18757 puDst->au32[0] = au32F[2];
18758}
18759
18760
18761/**
18762 * ADCX
18763 */
/**
 * Body of the ADCX/ADOX implementations: *puDst += uSrc + flag, with the
 * carry out written back to the same flag and no other flag touched.
 *
 * Relies on the enclosing function providing @c puDst, @c uSrc and
 * @c pfEFlags.
 *
 * @param   a_Flag  The EFLAGS bit used as carry in and carry out.
 * @param   a_Type  The operand type.
 * @param   a_Max   The maximum value of @a a_Type.
 */
#define ADX_EMIT(a_Flag, a_Type, a_Max) \
    do \
    { \
        bool const   fCarryIn  = RT_BOOL(*pfEFlags & (a_Flag)); \
        a_Type const uSum      = *puDst + uSrc; \
        /* Carry out: the plain sum wrapped, or adding the carry in wraps it. */ \
        bool const   fCarryOut = uSum < uSrc || (fCarryIn && uSum == a_Max); \
        if (fCarryOut) \
            *pfEFlags |= (a_Flag); \
        else \
            *pfEFlags &= ~(a_Flag); \
        *puDst = fCarryIn ? uSum + 1 : uSum; \
    } \
    while (0)
18781
/**
 * ADCX fallback, 32-bit: *puDst += uSrc + CF, updating only CF in *pfEFlags.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
}
18786
/**
 * ADCX fallback, 64-bit: *puDst += uSrc + CF, updating only CF in *pfEFlags.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
}
18791
18792# if defined(IEM_WITHOUT_ASSEMBLY)
18793
/**
 * ADCX, 32-bit, C variant used when building without assembly:
 * *puDst += uSrc + CF, updating only CF in *pfEFlags.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
}
18798
/**
 * ADCX, 64-bit, C variant used when building without assembly:
 * *puDst += uSrc + CF, updating only CF in *pfEFlags.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
}
18803
18804#endif
18805
18806
18807/**
18808 * ADOX
18809 */
/**
 * ADOX fallback, 32-bit: *puDst += uSrc + OF, updating only OF in *pfEFlags.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
}
18814
/**
 * ADOX fallback, 64-bit: *puDst += uSrc + OF, updating only OF in *pfEFlags.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
}
18819
18820# if defined(IEM_WITHOUT_ASSEMBLY)
18821
/**
 * ADOX, 32-bit, C variant used when building without assembly:
 * *puDst += uSrc + OF, updating only OF in *pfEFlags.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
}
18826
/**
 * ADOX, 64-bit, C variant used when building without assembly:
 * *puDst += uSrc + OF, updating only OF in *pfEFlags.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
{
    ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
}
18831
18832# endif
18833
18834
18835/**
18836 * MPSADBW
18837 */
18838IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18839{
18840 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
18841 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
18842 int16_t ai16Src1[11];
18843 int16_t ai16Src2[4];
18844
18845 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
18846 ai16Src1[i] = puDst->au8[idxSrc1 + i];
18847
18848 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
18849 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
18850
18851 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18852 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
18853 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
18854 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
18855 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
18856}
18857
18858
18859IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
18860{
18861 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
18862 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
18863 int16_t ai16Src1[11];
18864 int16_t ai16Src2[4];
18865
18866 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
18867 ai16Src1[i] = puSrc1->au8[idxSrc1 + i];
18868
18869 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
18870 ai16Src2[i] = puSrc2->au8[idxSrc2 + i];
18871
18872 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18873 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
18874 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
18875 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
18876 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
18877}
18878
18879
18880IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
18881{
18882 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
18883 RTUINT256U const uSrc2 = *puSrc2;
18884 ASMCompilerBarrier();
18885 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
18886 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil >> 3);
18887}
18888
18889
18890/**
18891 * VPERM2I128
18892 */
18893IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2i128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
18894{
18895 if (bImm & RT_BIT(3))
18896 {
18897 puDst->au64[0] = 0;
18898 puDst->au64[1] = 0;
18899 }
18900 else
18901 {
18902 switch (bImm & 0x3)
18903 {
18904 case 0:
18905 puDst->au64[0] = puSrc1->au64[0];
18906 puDst->au64[1] = puSrc1->au64[1];
18907 break;
18908 case 1:
18909 puDst->au64[0] = puSrc1->au64[2];
18910 puDst->au64[1] = puSrc1->au64[3];
18911 break;
18912 case 2:
18913 puDst->au64[0] = puSrc2->au64[0];
18914 puDst->au64[1] = puSrc2->au64[1];
18915 break;
18916 case 3:
18917 puDst->au64[0] = puSrc2->au64[2];
18918 puDst->au64[1] = puSrc2->au64[3];
18919 break;
18920 }
18921 }
18922
18923 if (bImm & RT_BIT(7))
18924 {
18925 puDst->au64[2] = 0;
18926 puDst->au64[3] = 0;
18927 }
18928 else
18929 {
18930 switch ((bImm >> 4) & 0x3)
18931 {
18932 case 0:
18933 puDst->au64[2] = puSrc1->au64[0];
18934 puDst->au64[3] = puSrc1->au64[1];
18935 break;
18936 case 1:
18937 puDst->au64[2] = puSrc1->au64[2];
18938 puDst->au64[3] = puSrc1->au64[3];
18939 break;
18940 case 2:
18941 puDst->au64[2] = puSrc2->au64[0];
18942 puDst->au64[3] = puSrc2->au64[1];
18943 break;
18944 case 3:
18945 puDst->au64[2] = puSrc2->au64[2];
18946 puDst->au64[3] = puSrc2->au64[3];
18947 break;
18948 }
18949 }
18950}
18951
18952
/**
 * VPERM2F128
 *
 * Forwards all arguments unchanged to iemAImpl_vperm2i128_u256_fallback.
 *
 * @param   puDst   Where to store the result.
 * @param   puSrc1  The first source operand.
 * @param   puSrc2  The second source operand.
 * @param   bImm    The control byte, passed through as is.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2f128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
{
    iemAImpl_vperm2i128_u256_fallback(puDst, puSrc1, puSrc2, bImm);
}
18960
18961
/**
 * DPPS
 *
 * Not implemented: none of the arguments are used and nothing is written;
 * the function only invokes AssertReleaseFailed().
 *
 * @param   pfMxcsr     Unused.
 * @param   puDst       Unused.
 * @param   pSrc        Unused.
 * @param   bImm        Unused.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_dpps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
{
    RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
    AssertReleaseFailed();
}
18970
18971
/**
 * DPPD
 *
 * Not implemented: none of the arguments are used and nothing is written;
 * the function only invokes AssertReleaseFailed().
 *
 * @param   pfMxcsr     Unused.
 * @param   puDst       Unused.
 * @param   pSrc        Unused.
 * @param   bImm        Unused.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_dppd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
{
    RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
    AssertReleaseFailed();
}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette