VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 97321

Last change on this file since 97321 was 97319, checked in by vboxsync, 2 years ago

VMM/IEM: Underflow signalling in fsincos instruction as described in 1985 version of IEEE 754, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 645.8 KB
Line 
1/* $Id: IEMAllAImplC.cpp 97319 2022-10-27 12:14:13Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2022 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless already defined by the build, it is defined here for ARM32 and ARM64
 * targets and when doxygen is running.
 */
#if    !defined(IEM_WITHOUT_ASSEMBLY) \
    && (defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING))
# define IEM_WITHOUT_ASSEMBLY
#endif

/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
#if defined(IEM_WITH_ASSEMBLY)
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Calculates the sign flag value given a result and its bit width.
 *
 * The sign flag (SF) is a copy of the most significant bit of the result, so
 * the result is shifted until that bit lands on the SF position and is then
 * masked.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( X86_EFL_SF & (uint32_t)((a_uResult) >> ((a_cBitsWidth) - 1 - X86_EFL_SF_BIT)) )
73
/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) is set when the result is zero and clear otherwise.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (a_uResult) == 0 ? (uint32_t)1 << X86_EFL_ZF_BIT : (uint32_t)0 )
84
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * Each variant moves the top bit of a value of the given width to the OF
 * position and masks it.  They are typically selected by concatenating the
 * bit count onto the macro name.  The 8-bit variant needs to shift in the
 * opposite direction to the others, hence one macro per width.
 */
#define X86_EFL_GET_OF_8(a_uValue)  ( X86_EFL_OF & ((uint32_t)(a_uValue) << (X86_EFL_OF_BIT + 1 - 8)) )
#define X86_EFL_GET_OF_16(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (16 - 1 - X86_EFL_OF_BIT)) )
#define X86_EFL_GET_OF_32(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (32 - 1 - X86_EFL_OF_BIT)) )
#define X86_EFL_GET_OF_64(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (64 - 1 - X86_EFL_OF_BIT)) )
95
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after an arithmetic op.
 *
 * This is a statement macro (do/while wrapper): it yields no value.  It reads
 * the flags through @a a_pfEFlags, clears everything in X86_EFL_STATUS_BITS,
 * ORs in the freshly calculated status bits and writes the result back through
 * @a a_pfEFlags.
 *
 * @note    Several arguments are expanded more than once, so only pass
 *          expressions without side effects.
 * @note    @a a_cBitsWidth is token-pasted into a type name and a macro name,
 *          so it must be one of the literals 8, 16, 32 or 64.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* Overflow during ADDition happens when both inputs have the same sign \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
           follows that for SUBtraction the sign bit value must differ between \
           the two inputs and the result's sign bit must differ from the first \
           input's. \
           Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                      & RT_BIT_64(a_cBitsWidth - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
132
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations. AF on the other hand is
 * undefined. We do not set AF, as that seems to make the most sense (which
 * probably makes it the most wrong in real life).
 *
 * This is a statement macro (do/while wrapper): it yields no value.  It reads
 * the flags through @a a_pfEFlags, clears everything in X86_EFL_STATUS_BITS,
 * ORs in PF, ZF, SF and @a a_fExtra, and writes the result back through
 * @a a_pfEFlags.
 *
 * @note    @a a_uResult is expanded more than once, so only pass expressions
 *          without side effects.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
161/**
162 * Parity calculation table.
163 *
164 * This is also used by iemAllAImpl.asm.
165 *
166 * The generator code:
167 * @code
168 * #include <stdio.h>
169 *
170 * int main()
171 * {
172 * unsigned b;
173 * for (b = 0; b < 256; b++)
174 * {
175 * int cOnes = ( b & 1)
176 * + ((b >> 1) & 1)
177 * + ((b >> 2) & 1)
178 * + ((b >> 3) & 1)
179 * + ((b >> 4) & 1)
180 * + ((b >> 5) & 1)
181 * + ((b >> 6) & 1)
182 * + ((b >> 7) & 1);
183 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
184 * b,
185 * (b >> 7) & 1,
186 * (b >> 6) & 1,
187 * (b >> 5) & 1,
188 * (b >> 4) & 1,
189 * (b >> 3) & 1,
190 * (b >> 2) & 1,
191 * (b >> 1) & 1,
192 * b & 1,
193 * cOnes & 1 ? "0" : "X86_EFL_PF");
194 * }
195 * return 0;
196 * }
197 * @endcode
198 */
199uint8_t const g_afParity[256] =
200{
201 /* 0000 = 00000000b */ X86_EFL_PF,
202 /* 0x01 = 00000001b */ 0,
203 /* 0x02 = 00000010b */ 0,
204 /* 0x03 = 00000011b */ X86_EFL_PF,
205 /* 0x04 = 00000100b */ 0,
206 /* 0x05 = 00000101b */ X86_EFL_PF,
207 /* 0x06 = 00000110b */ X86_EFL_PF,
208 /* 0x07 = 00000111b */ 0,
209 /* 0x08 = 00001000b */ 0,
210 /* 0x09 = 00001001b */ X86_EFL_PF,
211 /* 0x0a = 00001010b */ X86_EFL_PF,
212 /* 0x0b = 00001011b */ 0,
213 /* 0x0c = 00001100b */ X86_EFL_PF,
214 /* 0x0d = 00001101b */ 0,
215 /* 0x0e = 00001110b */ 0,
216 /* 0x0f = 00001111b */ X86_EFL_PF,
217 /* 0x10 = 00010000b */ 0,
218 /* 0x11 = 00010001b */ X86_EFL_PF,
219 /* 0x12 = 00010010b */ X86_EFL_PF,
220 /* 0x13 = 00010011b */ 0,
221 /* 0x14 = 00010100b */ X86_EFL_PF,
222 /* 0x15 = 00010101b */ 0,
223 /* 0x16 = 00010110b */ 0,
224 /* 0x17 = 00010111b */ X86_EFL_PF,
225 /* 0x18 = 00011000b */ X86_EFL_PF,
226 /* 0x19 = 00011001b */ 0,
227 /* 0x1a = 00011010b */ 0,
228 /* 0x1b = 00011011b */ X86_EFL_PF,
229 /* 0x1c = 00011100b */ 0,
230 /* 0x1d = 00011101b */ X86_EFL_PF,
231 /* 0x1e = 00011110b */ X86_EFL_PF,
232 /* 0x1f = 00011111b */ 0,
233 /* 0x20 = 00100000b */ 0,
234 /* 0x21 = 00100001b */ X86_EFL_PF,
235 /* 0x22 = 00100010b */ X86_EFL_PF,
236 /* 0x23 = 00100011b */ 0,
237 /* 0x24 = 00100100b */ X86_EFL_PF,
238 /* 0x25 = 00100101b */ 0,
239 /* 0x26 = 00100110b */ 0,
240 /* 0x27 = 00100111b */ X86_EFL_PF,
241 /* 0x28 = 00101000b */ X86_EFL_PF,
242 /* 0x29 = 00101001b */ 0,
243 /* 0x2a = 00101010b */ 0,
244 /* 0x2b = 00101011b */ X86_EFL_PF,
245 /* 0x2c = 00101100b */ 0,
246 /* 0x2d = 00101101b */ X86_EFL_PF,
247 /* 0x2e = 00101110b */ X86_EFL_PF,
248 /* 0x2f = 00101111b */ 0,
249 /* 0x30 = 00110000b */ X86_EFL_PF,
250 /* 0x31 = 00110001b */ 0,
251 /* 0x32 = 00110010b */ 0,
252 /* 0x33 = 00110011b */ X86_EFL_PF,
253 /* 0x34 = 00110100b */ 0,
254 /* 0x35 = 00110101b */ X86_EFL_PF,
255 /* 0x36 = 00110110b */ X86_EFL_PF,
256 /* 0x37 = 00110111b */ 0,
257 /* 0x38 = 00111000b */ 0,
258 /* 0x39 = 00111001b */ X86_EFL_PF,
259 /* 0x3a = 00111010b */ X86_EFL_PF,
260 /* 0x3b = 00111011b */ 0,
261 /* 0x3c = 00111100b */ X86_EFL_PF,
262 /* 0x3d = 00111101b */ 0,
263 /* 0x3e = 00111110b */ 0,
264 /* 0x3f = 00111111b */ X86_EFL_PF,
265 /* 0x40 = 01000000b */ 0,
266 /* 0x41 = 01000001b */ X86_EFL_PF,
267 /* 0x42 = 01000010b */ X86_EFL_PF,
268 /* 0x43 = 01000011b */ 0,
269 /* 0x44 = 01000100b */ X86_EFL_PF,
270 /* 0x45 = 01000101b */ 0,
271 /* 0x46 = 01000110b */ 0,
272 /* 0x47 = 01000111b */ X86_EFL_PF,
273 /* 0x48 = 01001000b */ X86_EFL_PF,
274 /* 0x49 = 01001001b */ 0,
275 /* 0x4a = 01001010b */ 0,
276 /* 0x4b = 01001011b */ X86_EFL_PF,
277 /* 0x4c = 01001100b */ 0,
278 /* 0x4d = 01001101b */ X86_EFL_PF,
279 /* 0x4e = 01001110b */ X86_EFL_PF,
280 /* 0x4f = 01001111b */ 0,
281 /* 0x50 = 01010000b */ X86_EFL_PF,
282 /* 0x51 = 01010001b */ 0,
283 /* 0x52 = 01010010b */ 0,
284 /* 0x53 = 01010011b */ X86_EFL_PF,
285 /* 0x54 = 01010100b */ 0,
286 /* 0x55 = 01010101b */ X86_EFL_PF,
287 /* 0x56 = 01010110b */ X86_EFL_PF,
288 /* 0x57 = 01010111b */ 0,
289 /* 0x58 = 01011000b */ 0,
290 /* 0x59 = 01011001b */ X86_EFL_PF,
291 /* 0x5a = 01011010b */ X86_EFL_PF,
292 /* 0x5b = 01011011b */ 0,
293 /* 0x5c = 01011100b */ X86_EFL_PF,
294 /* 0x5d = 01011101b */ 0,
295 /* 0x5e = 01011110b */ 0,
296 /* 0x5f = 01011111b */ X86_EFL_PF,
297 /* 0x60 = 01100000b */ X86_EFL_PF,
298 /* 0x61 = 01100001b */ 0,
299 /* 0x62 = 01100010b */ 0,
300 /* 0x63 = 01100011b */ X86_EFL_PF,
301 /* 0x64 = 01100100b */ 0,
302 /* 0x65 = 01100101b */ X86_EFL_PF,
303 /* 0x66 = 01100110b */ X86_EFL_PF,
304 /* 0x67 = 01100111b */ 0,
305 /* 0x68 = 01101000b */ 0,
306 /* 0x69 = 01101001b */ X86_EFL_PF,
307 /* 0x6a = 01101010b */ X86_EFL_PF,
308 /* 0x6b = 01101011b */ 0,
309 /* 0x6c = 01101100b */ X86_EFL_PF,
310 /* 0x6d = 01101101b */ 0,
311 /* 0x6e = 01101110b */ 0,
312 /* 0x6f = 01101111b */ X86_EFL_PF,
313 /* 0x70 = 01110000b */ 0,
314 /* 0x71 = 01110001b */ X86_EFL_PF,
315 /* 0x72 = 01110010b */ X86_EFL_PF,
316 /* 0x73 = 01110011b */ 0,
317 /* 0x74 = 01110100b */ X86_EFL_PF,
318 /* 0x75 = 01110101b */ 0,
319 /* 0x76 = 01110110b */ 0,
320 /* 0x77 = 01110111b */ X86_EFL_PF,
321 /* 0x78 = 01111000b */ X86_EFL_PF,
322 /* 0x79 = 01111001b */ 0,
323 /* 0x7a = 01111010b */ 0,
324 /* 0x7b = 01111011b */ X86_EFL_PF,
325 /* 0x7c = 01111100b */ 0,
326 /* 0x7d = 01111101b */ X86_EFL_PF,
327 /* 0x7e = 01111110b */ X86_EFL_PF,
328 /* 0x7f = 01111111b */ 0,
329 /* 0x80 = 10000000b */ 0,
330 /* 0x81 = 10000001b */ X86_EFL_PF,
331 /* 0x82 = 10000010b */ X86_EFL_PF,
332 /* 0x83 = 10000011b */ 0,
333 /* 0x84 = 10000100b */ X86_EFL_PF,
334 /* 0x85 = 10000101b */ 0,
335 /* 0x86 = 10000110b */ 0,
336 /* 0x87 = 10000111b */ X86_EFL_PF,
337 /* 0x88 = 10001000b */ X86_EFL_PF,
338 /* 0x89 = 10001001b */ 0,
339 /* 0x8a = 10001010b */ 0,
340 /* 0x8b = 10001011b */ X86_EFL_PF,
341 /* 0x8c = 10001100b */ 0,
342 /* 0x8d = 10001101b */ X86_EFL_PF,
343 /* 0x8e = 10001110b */ X86_EFL_PF,
344 /* 0x8f = 10001111b */ 0,
345 /* 0x90 = 10010000b */ X86_EFL_PF,
346 /* 0x91 = 10010001b */ 0,
347 /* 0x92 = 10010010b */ 0,
348 /* 0x93 = 10010011b */ X86_EFL_PF,
349 /* 0x94 = 10010100b */ 0,
350 /* 0x95 = 10010101b */ X86_EFL_PF,
351 /* 0x96 = 10010110b */ X86_EFL_PF,
352 /* 0x97 = 10010111b */ 0,
353 /* 0x98 = 10011000b */ 0,
354 /* 0x99 = 10011001b */ X86_EFL_PF,
355 /* 0x9a = 10011010b */ X86_EFL_PF,
356 /* 0x9b = 10011011b */ 0,
357 /* 0x9c = 10011100b */ X86_EFL_PF,
358 /* 0x9d = 10011101b */ 0,
359 /* 0x9e = 10011110b */ 0,
360 /* 0x9f = 10011111b */ X86_EFL_PF,
361 /* 0xa0 = 10100000b */ X86_EFL_PF,
362 /* 0xa1 = 10100001b */ 0,
363 /* 0xa2 = 10100010b */ 0,
364 /* 0xa3 = 10100011b */ X86_EFL_PF,
365 /* 0xa4 = 10100100b */ 0,
366 /* 0xa5 = 10100101b */ X86_EFL_PF,
367 /* 0xa6 = 10100110b */ X86_EFL_PF,
368 /* 0xa7 = 10100111b */ 0,
369 /* 0xa8 = 10101000b */ 0,
370 /* 0xa9 = 10101001b */ X86_EFL_PF,
371 /* 0xaa = 10101010b */ X86_EFL_PF,
372 /* 0xab = 10101011b */ 0,
373 /* 0xac = 10101100b */ X86_EFL_PF,
374 /* 0xad = 10101101b */ 0,
375 /* 0xae = 10101110b */ 0,
376 /* 0xaf = 10101111b */ X86_EFL_PF,
377 /* 0xb0 = 10110000b */ 0,
378 /* 0xb1 = 10110001b */ X86_EFL_PF,
379 /* 0xb2 = 10110010b */ X86_EFL_PF,
380 /* 0xb3 = 10110011b */ 0,
381 /* 0xb4 = 10110100b */ X86_EFL_PF,
382 /* 0xb5 = 10110101b */ 0,
383 /* 0xb6 = 10110110b */ 0,
384 /* 0xb7 = 10110111b */ X86_EFL_PF,
385 /* 0xb8 = 10111000b */ X86_EFL_PF,
386 /* 0xb9 = 10111001b */ 0,
387 /* 0xba = 10111010b */ 0,
388 /* 0xbb = 10111011b */ X86_EFL_PF,
389 /* 0xbc = 10111100b */ 0,
390 /* 0xbd = 10111101b */ X86_EFL_PF,
391 /* 0xbe = 10111110b */ X86_EFL_PF,
392 /* 0xbf = 10111111b */ 0,
393 /* 0xc0 = 11000000b */ X86_EFL_PF,
394 /* 0xc1 = 11000001b */ 0,
395 /* 0xc2 = 11000010b */ 0,
396 /* 0xc3 = 11000011b */ X86_EFL_PF,
397 /* 0xc4 = 11000100b */ 0,
398 /* 0xc5 = 11000101b */ X86_EFL_PF,
399 /* 0xc6 = 11000110b */ X86_EFL_PF,
400 /* 0xc7 = 11000111b */ 0,
401 /* 0xc8 = 11001000b */ 0,
402 /* 0xc9 = 11001001b */ X86_EFL_PF,
403 /* 0xca = 11001010b */ X86_EFL_PF,
404 /* 0xcb = 11001011b */ 0,
405 /* 0xcc = 11001100b */ X86_EFL_PF,
406 /* 0xcd = 11001101b */ 0,
407 /* 0xce = 11001110b */ 0,
408 /* 0xcf = 11001111b */ X86_EFL_PF,
409 /* 0xd0 = 11010000b */ 0,
410 /* 0xd1 = 11010001b */ X86_EFL_PF,
411 /* 0xd2 = 11010010b */ X86_EFL_PF,
412 /* 0xd3 = 11010011b */ 0,
413 /* 0xd4 = 11010100b */ X86_EFL_PF,
414 /* 0xd5 = 11010101b */ 0,
415 /* 0xd6 = 11010110b */ 0,
416 /* 0xd7 = 11010111b */ X86_EFL_PF,
417 /* 0xd8 = 11011000b */ X86_EFL_PF,
418 /* 0xd9 = 11011001b */ 0,
419 /* 0xda = 11011010b */ 0,
420 /* 0xdb = 11011011b */ X86_EFL_PF,
421 /* 0xdc = 11011100b */ 0,
422 /* 0xdd = 11011101b */ X86_EFL_PF,
423 /* 0xde = 11011110b */ X86_EFL_PF,
424 /* 0xdf = 11011111b */ 0,
425 /* 0xe0 = 11100000b */ 0,
426 /* 0xe1 = 11100001b */ X86_EFL_PF,
427 /* 0xe2 = 11100010b */ X86_EFL_PF,
428 /* 0xe3 = 11100011b */ 0,
429 /* 0xe4 = 11100100b */ X86_EFL_PF,
430 /* 0xe5 = 11100101b */ 0,
431 /* 0xe6 = 11100110b */ 0,
432 /* 0xe7 = 11100111b */ X86_EFL_PF,
433 /* 0xe8 = 11101000b */ X86_EFL_PF,
434 /* 0xe9 = 11101001b */ 0,
435 /* 0xea = 11101010b */ 0,
436 /* 0xeb = 11101011b */ X86_EFL_PF,
437 /* 0xec = 11101100b */ 0,
438 /* 0xed = 11101101b */ X86_EFL_PF,
439 /* 0xee = 11101110b */ X86_EFL_PF,
440 /* 0xef = 11101111b */ 0,
441 /* 0xf0 = 11110000b */ X86_EFL_PF,
442 /* 0xf1 = 11110001b */ 0,
443 /* 0xf2 = 11110010b */ 0,
444 /* 0xf3 = 11110011b */ X86_EFL_PF,
445 /* 0xf4 = 11110100b */ 0,
446 /* 0xf5 = 11110101b */ X86_EFL_PF,
447 /* 0xf6 = 11110110b */ X86_EFL_PF,
448 /* 0xf7 = 11110111b */ 0,
449 /* 0xf8 = 11111000b */ 0,
450 /* 0xf9 = 11111001b */ X86_EFL_PF,
451 /* 0xfa = 11111010b */ X86_EFL_PF,
452 /* 0xfb = 11111011b */ 0,
453 /* 0xfc = 11111100b */ X86_EFL_PF,
454 /* 0xfd = 11111101b */ 0,
455 /* 0xfe = 11111110b */ 0,
456 /* 0xff = 11111111b */ X86_EFL_PF,
457};
458
459/* for clang: */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT80U g_ar80One[];
464extern const RTFLOAT80U g_r80Indefinite;
465extern const RTFLOAT32U g_ar32Infinity[];
466extern const RTFLOAT64U g_ar64Infinity[];
467extern const RTFLOAT80U g_ar80Infinity[];
468extern const RTFLOAT128U g_r128Ln2;
469extern const RTUINT128U g_u128Ln2Mantissa;
470extern const RTUINT128U g_u128Ln2MantissaIntel;
471extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
472extern const RTFLOAT32U g_ar32QNaN[];
473extern const RTFLOAT64U g_ar64QNaN[];
474
475/** Zero values (indexed by fSign). */
476RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
477RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
478RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
479
480/** One values (indexed by fSign). */
481RTFLOAT80U const g_ar80One[] =
482{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
483
484/** Indefinite (negative). */
485RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
486
487/** Infinities (indexed by fSign). */
488RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
489RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
490RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
491
492/** Default QNaNs (indexed by fSign). */
493RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
494RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
495
496
497#if 0
498/** 128-bit floating point constant: 2.0 */
499const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
500#endif
501
502
503/* The next section is generated by tools/IEMGenFpuConstants: */
504
/** The ln2 constant as 128-bit floating point value.
 * base-10: 6.93147180559945309417232121458176575e-1
 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1
 *
 * @note The expansions above match the full-precision initializer kept in the
 *       commented-out line.  The active definition passes 0xf357900000000000
 *       instead of 0xf35793c7673007e6 as its third argument, i.e. the low 44
 *       bits of that argument are zero.
 *       NOTE(review): the reason for the truncation is not visible here. */
//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
/** High precision ln2 value (128-bit mantissa).
 * base-10: 6.931471805599453094172321214581765680747e-1
 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
 * Same first argument as g_u128Ln2Mantissa; the second one keeps only its top
 * two bits (0xc000000000000000).
 * base-10: 6.931471805599453094151379470289064954613e-1
 * base-16: b.17217f7d1cf79abc0000000000000000@-1
 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
521
522/** Horner constants for f2xm1 */
523const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
524{
525 /* a0
526 * base-10: 1.00000000000000000000000000000000000e0
527 * base-16: 1.0000000000000000000000000000@0
528 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
529 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
530 /* a1
531 * base-10: 5.00000000000000000000000000000000000e-1
532 * base-16: 8.0000000000000000000000000000@-1
533 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
534 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
535 /* a2
536 * base-10: 1.66666666666666666666666666666666658e-1
537 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
538 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
539 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
540 /* a3
541 * base-10: 4.16666666666666666666666666666666646e-2
542 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
543 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
544 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
545 /* a4
546 * base-10: 8.33333333333333333333333333333333323e-3
547 * base-16: 2.2222222222222222222222222222@-2
548 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
549 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
550 /* a5
551 * base-10: 1.38888888888888888888888888888888874e-3
552 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
553 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
554 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
555 /* a6
556 * base-10: 1.98412698412698412698412698412698412e-4
557 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
558 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
559 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
560 /* a7
561 * base-10: 2.48015873015873015873015873015873015e-5
562 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
563 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
564 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
565 /* a8
566 * base-10: 2.75573192239858906525573192239858902e-6
567 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
568 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
569 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
570 /* a9
571 * base-10: 2.75573192239858906525573192239858865e-7
572 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
573 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
574 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
575 /* a10
576 * base-10: 2.50521083854417187750521083854417184e-8
577 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
578 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
579 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
580 /* a11
581 * base-10: 2.08767569878680989792100903212014296e-9
582 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
583 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
584 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
585 /* a12
586 * base-10: 1.60590438368216145993923771701549472e-10
587 * base-16: b.092309d43684be51c198e91d7b40@-9
588 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
589 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
590 /* a13
591 * base-10: 1.14707455977297247138516979786821043e-11
592 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
593 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
594 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
595 /* a14
596 * base-10: 7.64716373181981647590113198578806964e-13
597 * base-16: d.73f9f399dc0f88ec32b587746578@-11
598 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
599 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
600 /* a15
601 * base-10: 4.77947733238738529743820749111754352e-14
602 * base-16: d.73f9f399dc0f88ec32b587746578@-12
603 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
604 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
605 /* a16
606 * base-10: 2.81145725434552076319894558301031970e-15
607 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
608 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
609 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
610 /* a17
611 * base-10: 1.56192069685862264622163643500573321e-16
612 * base-16: b.413c31dcbecbbdd8024435161550@-14
613 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
614 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
615 /* a18
616 * base-10: 8.22063524662432971695598123687227980e-18
617 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
618 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
619 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
620 /* a19
621 * base-10: 4.11031762331216485847799061843614006e-19
622 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
623 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
624 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
625 /* a20
626 * base-10: 1.95729410633912612308475743735054143e-20
627 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
628 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
629 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
630 /* a21
631 * base-10: 8.89679139245057328674889744250246106e-22
632 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
633 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
634 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
635};
636
637
638/*
639 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
640 * it all in C is probably safer atm., optimize what's necessary later, maybe.
641 */
642#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
643
644
645/*********************************************************************************************************************************
646* Binary Operations *
647*********************************************************************************************************************************/
648
649/*
650 * ADD
651 */
652
653IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
654{
655 uint64_t uDst = *puDst;
656 uint64_t uResult = uDst + uSrc;
657 *puDst = uResult;
658 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
659}
660
661# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
662
663IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
664{
665 uint32_t uDst = *puDst;
666 uint32_t uResult = uDst + uSrc;
667 *puDst = uResult;
668 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
669}
670
671
672IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
673{
674 uint16_t uDst = *puDst;
675 uint16_t uResult = uDst + uSrc;
676 *puDst = uResult;
677 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
678}
679
680
681IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
682{
683 uint8_t uDst = *puDst;
684 uint8_t uResult = uDst + uSrc;
685 *puDst = uResult;
686 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
687}
688
689# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
690
691/*
692 * ADC
693 */
694
695IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
696{
697 if (!(*pfEFlags & X86_EFL_CF))
698 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
699 else
700 {
701 uint64_t uDst = *puDst;
702 uint64_t uResult = uDst + uSrc + 1;
703 *puDst = uResult;
704 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
705 }
706}
707
708# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
709
710IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
711{
712 if (!(*pfEFlags & X86_EFL_CF))
713 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
714 else
715 {
716 uint32_t uDst = *puDst;
717 uint32_t uResult = uDst + uSrc + 1;
718 *puDst = uResult;
719 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
720 }
721}
722
723
724IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
725{
726 if (!(*pfEFlags & X86_EFL_CF))
727 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
728 else
729 {
730 uint16_t uDst = *puDst;
731 uint16_t uResult = uDst + uSrc + 1;
732 *puDst = uResult;
733 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
734 }
735}
736
737
738IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
739{
740 if (!(*pfEFlags & X86_EFL_CF))
741 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
742 else
743 {
744 uint8_t uDst = *puDst;
745 uint8_t uResult = uDst + uSrc + 1;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
748 }
749}
750
751# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
752
753/*
754 * SUB
755 */
756
757IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
758{
759 uint64_t uDst = *puDst;
760 uint64_t uResult = uDst - uSrc;
761 *puDst = uResult;
762 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
763}
764
765# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
766
767IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
768{
769 uint32_t uDst = *puDst;
770 uint32_t uResult = uDst - uSrc;
771 *puDst = uResult;
772 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
773}
774
775
776IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
777{
778 uint16_t uDst = *puDst;
779 uint16_t uResult = uDst - uSrc;
780 *puDst = uResult;
781 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
782}
783
784
785IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
786{
787 uint8_t uDst = *puDst;
788 uint8_t uResult = uDst - uSrc;
789 *puDst = uResult;
790 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
791}
792
793# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
794
795/*
796 * SBB
797 */
798
799IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
800{
801 if (!(*pfEFlags & X86_EFL_CF))
802 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
803 else
804 {
805 uint64_t uDst = *puDst;
806 uint64_t uResult = uDst - uSrc - 1;
807 *puDst = uResult;
808 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
809 }
810}
811
812# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
813
814IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
815{
816 if (!(*pfEFlags & X86_EFL_CF))
817 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
818 else
819 {
820 uint32_t uDst = *puDst;
821 uint32_t uResult = uDst - uSrc - 1;
822 *puDst = uResult;
823 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
824 }
825}
826
827
828IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
829{
830 if (!(*pfEFlags & X86_EFL_CF))
831 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
832 else
833 {
834 uint16_t uDst = *puDst;
835 uint16_t uResult = uDst - uSrc - 1;
836 *puDst = uResult;
837 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
838 }
839}
840
841
842IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
843{
844 if (!(*pfEFlags & X86_EFL_CF))
845 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
846 else
847 {
848 uint8_t uDst = *puDst;
849 uint8_t uResult = uDst - uSrc - 1;
850 *puDst = uResult;
851 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
852 }
853}
854
855# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
856
857
858/*
859 * OR
860 */
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
863{
864 uint64_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
867}
868
869# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
870
871IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
872{
873 uint32_t uResult = *puDst | uSrc;
874 *puDst = uResult;
875 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
876}
877
878
879IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
880{
881 uint16_t uResult = *puDst | uSrc;
882 *puDst = uResult;
883 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
884}
885
886
887IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
888{
889 uint8_t uResult = *puDst | uSrc;
890 *puDst = uResult;
891 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
892}
893
894# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
895
896/*
897 * XOR
898 */
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
901{
902 uint64_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
905}
906
907# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
908
909IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
910{
911 uint32_t uResult = *puDst ^ uSrc;
912 *puDst = uResult;
913 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
914}
915
916
917IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
918{
919 uint16_t uResult = *puDst ^ uSrc;
920 *puDst = uResult;
921 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
922}
923
924
925IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
926{
927 uint8_t uResult = *puDst ^ uSrc;
928 *puDst = uResult;
929 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
930}
931
932# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
933
934/*
935 * AND
936 */
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
939{
940 uint64_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
943}
944
945# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
946
947IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
948{
949 uint32_t const uResult = *puDst & uSrc;
950 *puDst = uResult;
951 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
952}
953
954
955IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
956{
957 uint16_t const uResult = *puDst & uSrc;
958 *puDst = uResult;
959 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
960}
961
962
963IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
964{
965 uint8_t const uResult = *puDst & uSrc;
966 *puDst = uResult;
967 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
968}
969
970# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
971#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
972
973/*
974 * ANDN (BMI1 instruction)
975 */
976
977IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
978{
979 uint64_t const uResult = ~uSrc1 & uSrc2;
980 *puDst = uResult;
981 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
982}
983
984
985IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
986{
987 uint32_t const uResult = ~uSrc1 & uSrc2;
988 *puDst = uResult;
989 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
990}
991
992
993#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
994IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
995{
996 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
997}
998#endif
999
1000
1001#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1002IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1003{
1004 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1005}
1006#endif
1007
1008#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1009
1010/*
1011 * CMP
1012 */
1013
1014IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1015{
1016 uint64_t uDstTmp = *puDst;
1017 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1018}
1019
1020# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1021
1022IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1023{
1024 uint32_t uDstTmp = *puDst;
1025 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1026}
1027
1028
1029IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1030{
1031 uint16_t uDstTmp = *puDst;
1032 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1033}
1034
1035
1036IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1037{
1038 uint8_t uDstTmp = *puDst;
1039 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1040}
1041
1042# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1043
1044/*
1045 * TEST
1046 */
1047
1048IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1049{
1050 uint64_t uResult = *puDst & uSrc;
1051 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1052}
1053
1054# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1055
1056IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1057{
1058 uint32_t uResult = *puDst & uSrc;
1059 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1060}
1061
1062
1063IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1064{
1065 uint16_t uResult = *puDst & uSrc;
1066 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1067}
1068
1069
1070IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1071{
1072 uint8_t uResult = *puDst & uSrc;
1073 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1074}
1075
1076# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1077
1078
1079/*
1080 * LOCK prefixed variants of the above
1081 */
1082
/**
 * Locked binary operand operation of any width, built on a compare-exchange loop.
 *
 * Expects @c puDst, @c uSrc and @c pfEFlags to be in scope at the expansion
 * site.  The non-locked worker is applied to a private copy of the destination
 * and of the flags; the copy is then published with a compare-exchange, and the
 * whole computation is redone if the exchange fails.  The caller's flags are
 * written only after the exchange has succeeded.
 *
 * @param   a_Mnemonic      Instruction mnemonic selecting the iemAImpl_xxx worker.
 * @param   a_cBitsWidth    Operand width in bits (8, 16, 32 or 64).
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        /* uExpected is refreshed through the last compare-exchange argument \
           (presumably with the value currently in memory - see the IPRT docs). */ \
        uint ## a_cBitsWidth ## _t uExpected = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint32_t                   fEflNew; \
        for (;;) \
        { \
            uint ## a_cBitsWidth ## _t uNew = uExpected; \
            fEflNew = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNew, uSrc, &fEflNew); \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uExpected, &uExpected)) \
                break; \
        } \
        *pfEFlags = fEflNew; \
    } while (0)
1097
1098
1099#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1100 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
1101 uint ## a_cBitsWidth ## _t uSrc, \
1102 uint32_t *pfEFlags)) \
1103 { \
1104 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1105 }
1106
1107EMIT_LOCKED_BIN_OP(add, 64)
1108EMIT_LOCKED_BIN_OP(adc, 64)
1109EMIT_LOCKED_BIN_OP(sub, 64)
1110EMIT_LOCKED_BIN_OP(sbb, 64)
1111EMIT_LOCKED_BIN_OP(or, 64)
1112EMIT_LOCKED_BIN_OP(xor, 64)
1113EMIT_LOCKED_BIN_OP(and, 64)
1114# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1115EMIT_LOCKED_BIN_OP(add, 32)
1116EMIT_LOCKED_BIN_OP(adc, 32)
1117EMIT_LOCKED_BIN_OP(sub, 32)
1118EMIT_LOCKED_BIN_OP(sbb, 32)
1119EMIT_LOCKED_BIN_OP(or, 32)
1120EMIT_LOCKED_BIN_OP(xor, 32)
1121EMIT_LOCKED_BIN_OP(and, 32)
1122
1123EMIT_LOCKED_BIN_OP(add, 16)
1124EMIT_LOCKED_BIN_OP(adc, 16)
1125EMIT_LOCKED_BIN_OP(sub, 16)
1126EMIT_LOCKED_BIN_OP(sbb, 16)
1127EMIT_LOCKED_BIN_OP(or, 16)
1128EMIT_LOCKED_BIN_OP(xor, 16)
1129EMIT_LOCKED_BIN_OP(and, 16)
1130
1131EMIT_LOCKED_BIN_OP(add, 8)
1132EMIT_LOCKED_BIN_OP(adc, 8)
1133EMIT_LOCKED_BIN_OP(sub, 8)
1134EMIT_LOCKED_BIN_OP(sbb, 8)
1135EMIT_LOCKED_BIN_OP(or, 8)
1136EMIT_LOCKED_BIN_OP(xor, 8)
1137EMIT_LOCKED_BIN_OP(and, 8)
1138# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1139
1140
1141/*
1142 * Bit operations (same signature as above).
1143 */
1144
1145/*
1146 * BT
1147 */
1148
1149IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1150{
1151 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1152 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1153 Assert(uSrc < 64);
1154 uint64_t uDst = *puDst;
1155 if (uDst & RT_BIT_64(uSrc))
1156 *pfEFlags |= X86_EFL_CF;
1157 else
1158 *pfEFlags &= ~X86_EFL_CF;
1159}
1160
1161# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1162
1163IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1164{
1165 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1166 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1167 Assert(uSrc < 32);
1168 uint32_t uDst = *puDst;
1169 if (uDst & RT_BIT_32(uSrc))
1170 *pfEFlags |= X86_EFL_CF;
1171 else
1172 *pfEFlags &= ~X86_EFL_CF;
1173}
1174
1175IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1176{
1177 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1178 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1179 Assert(uSrc < 16);
1180 uint16_t uDst = *puDst;
1181 if (uDst & RT_BIT_32(uSrc))
1182 *pfEFlags |= X86_EFL_CF;
1183 else
1184 *pfEFlags &= ~X86_EFL_CF;
1185}
1186
1187# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1188
1189/*
1190 * BTC
1191 */
1192
1193IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1194{
1195 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1196 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1197 Assert(uSrc < 64);
1198 uint64_t fMask = RT_BIT_64(uSrc);
1199 uint64_t uDst = *puDst;
1200 if (uDst & fMask)
1201 {
1202 uDst &= ~fMask;
1203 *puDst = uDst;
1204 *pfEFlags |= X86_EFL_CF;
1205 }
1206 else
1207 {
1208 uDst |= fMask;
1209 *puDst = uDst;
1210 *pfEFlags &= ~X86_EFL_CF;
1211 }
1212}
1213
1214# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1215
1216IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1217{
1218 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1219 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1220 Assert(uSrc < 32);
1221 uint32_t fMask = RT_BIT_32(uSrc);
1222 uint32_t uDst = *puDst;
1223 if (uDst & fMask)
1224 {
1225 uDst &= ~fMask;
1226 *puDst = uDst;
1227 *pfEFlags |= X86_EFL_CF;
1228 }
1229 else
1230 {
1231 uDst |= fMask;
1232 *puDst = uDst;
1233 *pfEFlags &= ~X86_EFL_CF;
1234 }
1235}
1236
1237
1238IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1239{
1240 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1241 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1242 Assert(uSrc < 16);
1243 uint16_t fMask = RT_BIT_32(uSrc);
1244 uint16_t uDst = *puDst;
1245 if (uDst & fMask)
1246 {
1247 uDst &= ~fMask;
1248 *puDst = uDst;
1249 *pfEFlags |= X86_EFL_CF;
1250 }
1251 else
1252 {
1253 uDst |= fMask;
1254 *puDst = uDst;
1255 *pfEFlags &= ~X86_EFL_CF;
1256 }
1257}
1258
1259# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1260
1261/*
1262 * BTR
1263 */
1264
1265IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1266{
1267 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1268 logical operation (AND/OR/whatever). */
1269 Assert(uSrc < 64);
1270 uint64_t fMask = RT_BIT_64(uSrc);
1271 uint64_t uDst = *puDst;
1272 if (uDst & fMask)
1273 {
1274 uDst &= ~fMask;
1275 *puDst = uDst;
1276 *pfEFlags |= X86_EFL_CF;
1277 }
1278 else
1279 *pfEFlags &= ~X86_EFL_CF;
1280}
1281
1282# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1283
1284IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1285{
1286 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1287 logical operation (AND/OR/whatever). */
1288 Assert(uSrc < 32);
1289 uint32_t fMask = RT_BIT_32(uSrc);
1290 uint32_t uDst = *puDst;
1291 if (uDst & fMask)
1292 {
1293 uDst &= ~fMask;
1294 *puDst = uDst;
1295 *pfEFlags |= X86_EFL_CF;
1296 }
1297 else
1298 *pfEFlags &= ~X86_EFL_CF;
1299}
1300
1301
1302IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1303{
1304 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1305 logical operation (AND/OR/whatever). */
1306 Assert(uSrc < 16);
1307 uint16_t fMask = RT_BIT_32(uSrc);
1308 uint16_t uDst = *puDst;
1309 if (uDst & fMask)
1310 {
1311 uDst &= ~fMask;
1312 *puDst = uDst;
1313 *pfEFlags |= X86_EFL_CF;
1314 }
1315 else
1316 *pfEFlags &= ~X86_EFL_CF;
1317}
1318
1319# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1320
1321/*
1322 * BTS
1323 */
1324
1325IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1326{
1327 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1328 logical operation (AND/OR/whatever). */
1329 Assert(uSrc < 64);
1330 uint64_t fMask = RT_BIT_64(uSrc);
1331 uint64_t uDst = *puDst;
1332 if (uDst & fMask)
1333 *pfEFlags |= X86_EFL_CF;
1334 else
1335 {
1336 uDst |= fMask;
1337 *puDst = uDst;
1338 *pfEFlags &= ~X86_EFL_CF;
1339 }
1340}
1341
1342# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1343
1344IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1345{
1346 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1347 logical operation (AND/OR/whatever). */
1348 Assert(uSrc < 32);
1349 uint32_t fMask = RT_BIT_32(uSrc);
1350 uint32_t uDst = *puDst;
1351 if (uDst & fMask)
1352 *pfEFlags |= X86_EFL_CF;
1353 else
1354 {
1355 uDst |= fMask;
1356 *puDst = uDst;
1357 *pfEFlags &= ~X86_EFL_CF;
1358 }
1359}
1360
1361
1362IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1363{
1364 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1365 logical operation (AND/OR/whatever). */
1366 Assert(uSrc < 16);
1367 uint16_t fMask = RT_BIT_32(uSrc);
1368 uint32_t uDst = *puDst;
1369 if (uDst & fMask)
1370 *pfEFlags |= X86_EFL_CF;
1371 else
1372 {
1373 uDst |= fMask;
1374 *puDst = uDst;
1375 *pfEFlags &= ~X86_EFL_CF;
1376 }
1377}
1378
1379# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1380
1381
1382EMIT_LOCKED_BIN_OP(btc, 64)
1383EMIT_LOCKED_BIN_OP(btr, 64)
1384EMIT_LOCKED_BIN_OP(bts, 64)
1385# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1386EMIT_LOCKED_BIN_OP(btc, 32)
1387EMIT_LOCKED_BIN_OP(btr, 32)
1388EMIT_LOCKED_BIN_OP(bts, 32)
1389
1390EMIT_LOCKED_BIN_OP(btc, 16)
1391EMIT_LOCKED_BIN_OP(btr, 16)
1392EMIT_LOCKED_BIN_OP(bts, 16)
1393# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1394
1395
1396/*
1397 * Helpers for BSR and BSF.
1398 *
1399 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1400 * Intel behavior modelled on 10980xe, AMD on 3990X. Other marchs may
1401 * produce different result (see https://www.sandpile.org/x86/flags.htm),
1402 * but we restrict ourselves to emulating these recent marchs.
1403 */
/**
 * Stores a BSF/BSR result and updates EFLAGS the Intel way.
 *
 * All six status flags are cleared first.  When a bit was found, the zero
 * based index is stored and PF is set from the parity table entry for that
 * index; otherwise the destination is left untouched and ZF + PF are set.
 *
 * @param   a_puDst     Pointer to the destination; only written when a bit was found.
 * @param   a_pfEFlags  Pointer to the EFLAGS to update.
 * @param   a_iBit      One based index of the bit found, 0 if none was set.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR result and updates EFLAGS the AMD way.
 *
 * Only ZF is touched: cleared when a bit was found (and the zero based index
 * stored), set when the source had no bits set (destination left untouched).
 *
 * @param   a_puDst     Pointer to the destination; only written when a bit was found.
 * @param   a_pfEFlags  Pointer to the EFLAGS to update.
 * @param   a_iBit      One based index of the bit found, 0 if none was set.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst) = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1426
1427
1428/*
1429 * BSF - first (least significant) bit set
1430 */
1431IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1432{
1433 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1434}
1435
1436IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1437{
1438 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1439}
1440
1441IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1442{
1443 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1444}
1445
1446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1447
1448IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1449{
1450 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1451}
1452
1453IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1454{
1455 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1456}
1457
1458IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1459{
1460 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1461}
1462
1463
1464IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1465{
1466 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1467}
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1472}
1473
1474IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1475{
1476 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1477}
1478
1479# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1480
1481
1482/*
1483 * BSR - last (most significant) bit set
1484 */
1485IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1486{
1487 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1488}
1489
1490IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1491{
1492 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1493}
1494
1495IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1496{
1497 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1498}
1499
1500# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1501
1502IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1503{
1504 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1505}
1506
1507IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1510}
1511
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516
1517
1518IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1519{
1520 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1521}
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1526}
1527
1528IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1529{
1530 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1531}
1532
1533# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1534
1535
1536/*
1537 * Helpers for LZCNT and TZCNT.
1538 */
/**
 * Stores an LZCNT/TZCNT result and updates EFLAGS the Intel way.
 *
 * The count is always stored.  All six status flags are cleared first; a
 * non-zero count sets PF from the parity table, a zero count sets ZF + PF,
 * and a zero source additionally sets CF.
 *
 * @param   a_puDst     Pointer to the destination.
 * @param   a_uSrc      The source value (may be any expression).
 * @param   a_pfEFlags  Pointer to the EFLAGS to update.
 * @param   a_uResult   The bit count to store.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT result and updates EFLAGS the AMD way.
 *
 * The count is always stored.  Only ZF and CF are touched: ZF is set for a
 * zero count and CF for a zero source.
 *
 * @param   a_puDst     Pointer to the destination.
 * @param   a_uSrc      The source value (may be any expression).
 * @param   a_pfEFlags  Pointer to the EFLAGS to update.
 * @param   a_uResult   The bit count to store.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1561
1562
1563/*
1564 * LZCNT - count leading zero bits.
1565 */
1566IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1567{
1568 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1569}
1570
1571IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1572{
1573 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1574}
1575
1576IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1577{
1578 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1579}
1580
1581# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1582
1583IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1584{
1585 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1586}
1587
1588IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1589{
1590 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1591}
1592
1593IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1594{
1595 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1596}
1597
1598
1599IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1600{
1601 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1602}
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1605{
1606 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1607}
1608
1609IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1610{
1611 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1612}
1613
1614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1615
1616
1617/*
1618 * TZCNT - count trailing zero bits.
1619 */
1620IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1621{
1622 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1623}
1624
1625IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1626{
1627 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1628}
1629
1630IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1631{
1632 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1633}
1634
1635# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1636
1637IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1638{
1639 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1640}
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1643{
1644 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1650}
1651
1652
1653IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1654{
1655 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1656}
1657
1658IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1659{
1660 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1661}
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1666}
1667
1668# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1669#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1670
1671/*
1672 * BEXTR (BMI1 instruction)
1673 */
1674#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1675IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1676 a_Type uSrc2, uint32_t *pfEFlags)) \
1677{ \
1678 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1679 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1680 a_Type uResult; \
1681 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1682 if (iFirstBit < a_cBits) \
1683 { \
1684 uResult = uSrc1 >> iFirstBit; \
1685 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1686 if (cBits < a_cBits) \
1687 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1688 *puDst = uResult; \
1689 if (!uResult) \
1690 fEfl |= X86_EFL_ZF; \
1691 } \
1692 else \
1693 { \
1694 *puDst = uResult = 0; \
1695 fEfl |= X86_EFL_ZF; \
1696 } \
1697 /** @todo complete flag calculations. */ \
1698 *pfEFlags = fEfl; \
1699}
1700
1701EMIT_BEXTR(64, uint64_t, _fallback)
1702EMIT_BEXTR(32, uint32_t, _fallback)
1703#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1704EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1705#endif
1706#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1708#endif
1709
1710/*
1711 * BLSR (BMI1 instruction)
1712 */
1713#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1714IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1715{ \
1716 uint32_t fEfl1 = *pfEFlags; \
1717 uint32_t fEfl2 = fEfl1; \
1718 *puDst = uSrc; \
1719 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1720 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1721 \
1722 /* AMD: The carry flag is from the SUB operation. */ \
1723 /* 10890xe: PF always cleared? */ \
1724 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1725 fEfl2 |= fEfl1 & X86_EFL_CF; \
1726 *pfEFlags = fEfl2; \
1727}
1728
1729EMIT_BLSR(64, uint64_t, _fallback)
1730EMIT_BLSR(32, uint32_t, _fallback)
1731#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1732EMIT_BLSR(64, uint64_t, RT_NOTHING)
1733#endif
1734#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSR(32, uint32_t, RT_NOTHING)
1736#endif
1737
1738/*
1739 * BLSMSK (BMI1 instruction)
1740 */
1741#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1742IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1743{ \
1744 uint32_t fEfl1 = *pfEFlags; \
1745 uint32_t fEfl2 = fEfl1; \
1746 *puDst = uSrc; \
1747 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1748 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1749 \
1750 /* AMD: The carry flag is from the SUB operation. */ \
1751 /* 10890xe: PF always cleared? */ \
1752 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1753 fEfl2 |= fEfl1 & X86_EFL_CF; \
1754 *pfEFlags = fEfl2; \
1755}
1756
1757EMIT_BLSMSK(64, uint64_t, _fallback)
1758EMIT_BLSMSK(32, uint32_t, _fallback)
1759#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1760EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1761#endif
1762#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1764#endif
1765
1766/*
1767 * BLSI (BMI1 instruction)
1768 */
1769#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1770IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1771{ \
1772 uint32_t fEfl1 = *pfEFlags; \
1773 uint32_t fEfl2 = fEfl1; \
1774 *puDst = uSrc; \
1775 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1776 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1777 \
1778 /* AMD: The carry flag is from the SUB operation. */ \
1779 /* 10890xe: PF always cleared? */ \
1780 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1781 fEfl2 |= fEfl1 & X86_EFL_CF; \
1782 *pfEFlags = fEfl2; \
1783}
1784
1785EMIT_BLSI(64, uint64_t, _fallback)
1786EMIT_BLSI(32, uint32_t, _fallback)
1787#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1788EMIT_BLSI(64, uint64_t, RT_NOTHING)
1789#endif
1790#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1791EMIT_BLSI(32, uint32_t, RT_NOTHING)
1792#endif
1793
1794/*
1795 * BZHI (BMI2 instruction)
1796 */
1797#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1798IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1799 a_Type uSrc2, uint32_t *pfEFlags)) \
1800{ \
1801 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1802 a_Type uResult; \
1803 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1804 if (iFirstBit < a_cBits) \
1805 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1806 else \
1807 { \
1808 uResult = uSrc1; \
1809 fEfl |= X86_EFL_CF; \
1810 } \
1811 *puDst = uResult; \
1812 fEfl |= X86_EFL_CALC_ZF(uResult); \
1813 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1814 *pfEFlags = fEfl; \
1815}
1816
1817EMIT_BZHI(64, uint64_t, _fallback)
1818EMIT_BZHI(32, uint32_t, _fallback)
1819#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1820EMIT_BZHI(64, uint64_t, RT_NOTHING)
1821#endif
1822#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1823EMIT_BZHI(32, uint32_t, RT_NOTHING)
1824#endif
1825
1826/*
1827 * POPCNT
1828 */
1829RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1830{
1831 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1832 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1833 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1834 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1835};
1836
1837/** @todo Use native popcount where possible and employ some more efficient
1838 * algorithm here (or in asm.h fallback)! */
1839
1840DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1841{
1842 return g_abBitCounts6[ u16 & 0x3f]
1843 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1844 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1845}
1846
1847DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1848{
1849 return g_abBitCounts6[ u32 & 0x3f]
1850 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1851 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1852 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1853 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1854 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1855}
1856
1857DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1858{
1859 return g_abBitCounts6[ u64 & 0x3f]
1860 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1861 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1862 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1863 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1864 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1865 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1866 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1867 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1868 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1869 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1870}
1871
1872#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1873IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1874{ \
1875 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1876 a_Type uResult; \
1877 if (uSrc) \
1878 uResult = iemPopCountU ## a_cBits(uSrc); \
1879 else \
1880 { \
1881 fEfl |= X86_EFL_ZF; \
1882 uResult = 0; \
1883 } \
1884 *puDst = uResult; \
1885 *pfEFlags = fEfl; \
1886}
1887
1888EMIT_POPCNT(64, uint64_t, _fallback)
1889EMIT_POPCNT(32, uint32_t, _fallback)
1890EMIT_POPCNT(16, uint16_t, _fallback)
1891#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1892EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1893#endif
1894#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1895EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1896EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1897#endif
1898
1899
1900#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1901
1902/*
1903 * XCHG
1904 */
1905
1906IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1907{
1908#if ARCH_BITS >= 64
1909 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1910#else
1911 uint64_t uOldMem = *puMem;
1912 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1913 ASMNopPause();
1914 *puReg = uOldMem;
1915#endif
1916}
1917
1918# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1919
1920IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1921{
1922 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1923}
1924
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1927{
1928 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1929}
1930
1931
1932IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1933{
1934 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1935}
1936
1937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1938
1939
1940/* Unlocked variants for fDisregardLock mode: */
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1943{
1944 uint64_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1950
1951IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1952{
1953 uint32_t const uOld = *puMem;
1954 *puMem = *puReg;
1955 *puReg = uOld;
1956}
1957
1958
1959IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1960{
1961 uint16_t const uOld = *puMem;
1962 *puMem = *puReg;
1963 *puReg = uOld;
1964}
1965
1966
1967IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1968{
1969 uint8_t const uOld = *puMem;
1970 *puMem = *puReg;
1971 *puReg = uOld;
1972}
1973
1974# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1975
1976
1977/*
1978 * XADD and LOCK XADD.
1979 */
1980#define EMIT_XADD(a_cBitsWidth, a_Type) \
1981IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1982{ \
1983 a_Type uDst = *puDst; \
1984 a_Type uResult = uDst; \
1985 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1986 *puDst = uResult; \
1987 *puReg = uDst; \
1988} \
1989\
1990IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1991{ \
1992 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1993 a_Type uResult; \
1994 uint32_t fEflTmp; \
1995 do \
1996 { \
1997 uResult = uOld; \
1998 fEflTmp = *pfEFlags; \
1999 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2000 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2001 *puReg = uOld; \
2002 *pfEFlags = fEflTmp; \
2003}
2004EMIT_XADD(64, uint64_t)
2005# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2006EMIT_XADD(32, uint32_t)
2007EMIT_XADD(16, uint16_t)
2008EMIT_XADD(8, uint8_t)
2009# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2010
2011#endif
2012
2013/*
2014 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2015 *
2016 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2017 * instructions are emulated as locked.
2018 */
2019#if defined(IEM_WITHOUT_ASSEMBLY)
2020
2021IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2022{
2023 uint8_t uOld = *puAl;
2024 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2025 Assert(*puAl == uOld);
2026 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2027}
2028
2029
2030IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2031{
2032 uint16_t uOld = *puAx;
2033 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2034 Assert(*puAx == uOld);
2035 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2040{
2041 uint32_t uOld = *puEax;
2042 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2043 Assert(*puEax == uOld);
2044 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2045}
2046
2047
2048# if ARCH_BITS == 32
2049IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2050# else
2051IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2052# endif
2053{
2054# if ARCH_BITS == 32
2055 uint64_t const uSrcReg = *puSrcReg;
2056# endif
2057 uint64_t uOld = *puRax;
2058 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2059 Assert(*puRax == uOld);
2060 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2061}
2062
2063
2064IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2065 uint32_t *pEFlags))
2066{
2067 uint64_t const uNew = pu64EbxEcx->u;
2068 uint64_t const uOld = pu64EaxEdx->u;
2069 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2070 {
2071 Assert(pu64EaxEdx->u == uOld);
2072 *pEFlags |= X86_EFL_ZF;
2073 }
2074 else
2075 *pEFlags &= ~X86_EFL_ZF;
2076}
2077
2078
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * CMPXCHG16B: compare-exchanges the 128-bit value at @a pu128Dst.
 *
 * Sets ZF when the exchange call reports success and clears it otherwise.
 * The exchange call is handed &pu128RaxRdx->u as its old-value output.
 *
 * @param   pu128Dst        The 128-bit memory operand.
 * @param   pu128RaxRdx     The value to compare against.
 * @param   pu128RbxRcx     The replacement value.
 * @param   pEFlags         EFLAGS; only ZF is modified here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    /* Only needed by the assertion below. */
    RTUINT128U const uOld = *pu128RaxRdx;
#  endif
#  if defined(RT_ARCH_AMD64)
    /* This variant of the exchange primitive takes the new and old values as separate 64-bit halves. */
    if (ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
                               &pu128RaxRdx->u))
#  else
    if (ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u))
#  endif
    {
        Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
    else
        *pEFlags &= ~X86_EFL_ZF;
}
# endif
2100
2101#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2102
2103# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2104IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2105 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2106{
2107 RTUINT128U u128Tmp = *pu128Dst;
2108 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2109 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2110 {
2111 *pu128Dst = *pu128RbxRcx;
2112 *pEFlags |= X86_EFL_ZF;
2113 }
2114 else
2115 {
2116 *pu128RaxRdx = u128Tmp;
2117 *pEFlags &= ~X86_EFL_ZF;
2118 }
2119}
2120#endif /* !RT_ARCH_ARM64 */
2121
2122#if defined(IEM_WITHOUT_ASSEMBLY)
2123
/* Unlocked versions mapped to the locked ones: */

/** Unlocked 8-bit CMPXCHG; forwards to iemAImpl_cmpxchg_u8_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
}


/** Unlocked 16-bit CMPXCHG; forwards to iemAImpl_cmpxchg_u16_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
}


/** Unlocked 32-bit CMPXCHG; forwards to iemAImpl_cmpxchg_u32_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
}


/* Unlocked 64-bit CMPXCHG; forwards to iemAImpl_cmpxchg_u64_locked.  When
   ARCH_BITS is 32 the source register value is passed by pointer, matching
   the locked worker's signature. */
# if ARCH_BITS == 32
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
}
# else
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
}
# endif


/** Unlocked CMPXCHG8B; forwards to iemAImpl_cmpxchg8b_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
}


/** Unlocked CMPXCHG16B; forwards to iemAImpl_cmpxchg16b_locked.
 *  NOTE(review): the locked worker above is only defined when RT_ARCH_AMD64
 *  or RT_ARCH_ARM64 is defined - confirm other targets never build this. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                             uint32_t *pEFlags))
{
    iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
}
2168
2169#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2170
2171#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2172 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2173
2174/*
2175 * MUL, IMUL, DIV and IDIV helpers.
2176 *
2177 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2178 * division step so we can select between using C operators and
2179 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2180 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
 *   IDIV/DIV taking all of their input in AX too.  This means we have to
 *   abstract some input loads and the result storing.
2184 */
2185
2186DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2187{
2188# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2189 pQuotient->s.Lo = 0;
2190 pQuotient->s.Hi = 0;
2191# endif
2192 RTUINT128U Divisor;
2193 Divisor.s.Lo = u64Divisor;
2194 Divisor.s.Hi = 0;
2195 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2196}
2197
/*
 * Operand access helpers.  These expand inside the EMIT_* worker bodies and
 * refer to the worker parameters puA/puD (or puAX for the 8-bit workers) by
 * name.  None of the macros ends in a semicolon; the user supplies it.
 */

/* Load the double-width dividend from *puD:*puA, or from *puAX (8-bit). */
# define DIV_LOAD(a_Dividend) \
    (a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD
# define DIV_LOAD_U8(a_Dividend) \
    (a_Dividend).u = *puAX

/* Store the quotient in *puA and the remainder in *puD, or pack them into the
   low and high byte of *puAX (8-bit). */
# define DIV_STORE(a_Quotient, a_uRemainder)    *puA = (a_Quotient), *puD = (a_uRemainder)
# define DIV_STORE_U8(a_Quotient, a_uRemainder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8)

/* Load the implicit first factor from *puA, or from the low byte of *puAX. */
# define MUL_LOAD_F1()      (*puA)
# define MUL_LOAD_F1_U8()   ((uint8_t)*puAX)

/* Store the double-width product in *puD:*puA, or in *puAX (8-bit). */
# define MUL_STORE(a_Result)    *puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi
# define MUL_STORE_U8(a_Result) *puAX = (a_Result).u

/* Two's complement negation of a double-width value. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Unsigned multiplication producing a double-width result. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2)

/* Unsigned division of a double-width dividend, yielding quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    (a_Quotient).u = (a_Dividend).u / (a_uDivisor), \
    (a_Remainder).u = (a_Dividend).u % (a_uDivisor)
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), a_uDivisor)
2227
2228
2229/*
2230 * MUL
2231 */
2232# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2233IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2234{ \
2235 RTUINT ## a_cBitsWidth2x ## U Result; \
2236 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2237 a_fnStore(Result); \
2238 \
2239 /* Calc EFLAGS: */ \
2240 uint32_t fEfl = *pfEFlags; \
2241 if (a_fIntelFlags) \
2242 { /* Intel: 6700K and 10980XE behavior */ \
2243 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2244 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2245 fEfl |= X86_EFL_SF; \
2246 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2247 if (Result.s.Hi != 0) \
2248 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2249 } \
2250 else \
2251 { /* AMD: 3990X */ \
2252 if (Result.s.Hi != 0) \
2253 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2254 else \
2255 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2256 } \
2257 *pfEFlags = fEfl; \
2258 return 0; \
2259} \
2260
2261# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2262 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2263 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2264 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2265
2266# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2267EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2268 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2269# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2270EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2271 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2272EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2273 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2274EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2275 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2276# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2277# endif /* !DOXYGEN_RUNNING */
2278
2279/*
2280 * MULX
2281 */
2282# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2283IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2284 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2285{ \
2286 RTUINT ## a_cBitsWidth2x ## U Result; \
2287 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2288 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2289 *puDst1 = Result.s.Hi; \
2290} \
2291
2292# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2293EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2294EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2295# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2296EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2297EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2298# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2299# endif /* !DOXYGEN_RUNNING */
2300
2301
2302/*
2303 * IMUL
2304 *
2305 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2306 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2307 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2308 */
2309# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2310 a_Suffix, a_fIntelFlags) \
2311IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2312{ \
2313 RTUINT ## a_cBitsWidth2x ## U Result; \
2314 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2315 \
2316 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2317 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2318 { \
2319 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2320 { \
2321 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2322 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2323 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2324 } \
2325 else \
2326 { \
2327 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2328 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2329 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2330 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2331 a_fnNeg(Result, a_cBitsWidth2x); \
2332 } \
2333 } \
2334 else \
2335 { \
2336 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2337 { \
2338 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2339 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2340 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2341 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2342 a_fnNeg(Result, a_cBitsWidth2x); \
2343 } \
2344 else \
2345 { \
2346 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2347 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2348 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2349 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2350 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2351 } \
2352 } \
2353 a_fnStore(Result); \
2354 \
2355 if (a_fIntelFlags) \
2356 { \
2357 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2358 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2359 fEfl |= X86_EFL_SF; \
2360 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2361 } \
2362 *pfEFlags = fEfl; \
2363 return 0; \
2364}
2365# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2366 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2367 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2368 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2369
2370# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2371EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2372 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2373# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2374EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2375 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2376EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2377 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2378EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2379 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2380# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2381# endif /* !DOXYGEN_RUNNING */
2382
2383
2384/*
2385 * IMUL with two operands are mapped onto the three operand variant, ignoring
2386 * the high part of the product.
2387 */
2388# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2389IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2390{ \
2391 a_uType uIgn; \
2392 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2393} \
2394\
2395IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2396{ \
2397 a_uType uIgn; \
2398 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2399} \
2400\
2401IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2402{ \
2403 a_uType uIgn; \
2404 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2405}
2406
2407EMIT_IMUL_TWO(64, uint64_t)
2408# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2409EMIT_IMUL_TWO(32, uint32_t)
2410EMIT_IMUL_TWO(16, uint16_t)
2411# endif
2412
2413
2414/*
2415 * DIV
2416 */
2417# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2418 a_Suffix, a_fIntelFlags) \
2419IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2420{ \
2421 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2422 a_fnLoad(Dividend); \
2423 if ( uDivisor != 0 \
2424 && Dividend.s.Hi < uDivisor) \
2425 { \
2426 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2427 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2428 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2429 \
2430 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2431 if (!a_fIntelFlags) \
2432 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2433 return 0; \
2434 } \
2435 /* #DE */ \
2436 return -1; \
2437}
2438# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2439 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2440 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2441 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2442
2443# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2444EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2445 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2447EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2448 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2449EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2450 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2451EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2452 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2453# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2454# endif /* !DOXYGEN_RUNNING */
2455
2456
2457/*
2458 * IDIV
2459 *
2460 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2461 * set AF and clear PF, ZF and SF just like it does for DIV.
2462 *
2463 */
2464# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2465 a_Suffix, a_fIntelFlags) \
2466IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2467{ \
2468 /* Note! Skylake leaves all flags alone. */ \
2469 \
2470 /** @todo overflow checks */ \
2471 if (uDivisor != 0) \
2472 { \
2473 /* \
2474 * Convert to unsigned division. \
2475 */ \
2476 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2477 a_fnLoad(Dividend); \
2478 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2479 if (fSignedDividend) \
2480 a_fnNeg(Dividend, a_cBitsWidth2x); \
2481 \
2482 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2483 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2484 uDivisorPositive = uDivisor; \
2485 else \
2486 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2487 \
2488 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2489 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2490 \
2491 /* \
2492 * Setup the result, checking for overflows. \
2493 */ \
2494 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2495 { \
2496 if (!fSignedDividend) \
2497 { \
2498 /* Positive divisor, positive dividend => result positive. */ \
2499 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2500 { \
2501 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2502 if (!a_fIntelFlags) \
2503 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2504 return 0; \
2505 } \
2506 } \
2507 else \
2508 { \
2509 /* Positive divisor, negative dividend => result negative. */ \
2510 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2511 { \
2512 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2513 if (!a_fIntelFlags) \
2514 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2515 return 0; \
2516 } \
2517 } \
2518 } \
2519 else \
2520 { \
2521 if (!fSignedDividend) \
2522 { \
2523 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2524 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2525 { \
2526 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2527 if (!a_fIntelFlags) \
2528 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2529 return 0; \
2530 } \
2531 } \
2532 else \
2533 { \
2534 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2535 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2536 { \
2537 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2538 if (!a_fIntelFlags) \
2539 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2540 return 0; \
2541 } \
2542 } \
2543 } \
2544 } \
2545 /* #DE */ \
2546 return -1; \
2547}
2548# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2549 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2550 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2551 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2552
2553# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2554EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2555 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2557EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2558 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2559EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2560 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2561EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2562 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2563# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2564# endif /* !DOXYGEN_RUNNING */
2565
2566#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2567
2568
2569/*********************************************************************************************************************************
2570* Unary operations. *
2571*********************************************************************************************************************************/
2572#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2573
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC
 * instruction, writing the new flags back through @a a_pfEFlags.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * This is a statement macro; it yields no value.  Note that @a a_uResult and
 * @a a_uDst are evaluated more than once.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_OfMethod      0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        /* Drop all status bits except CF. */ \
        uint32_t fEflTmp = *(a_pfEFlags) & ((~X86_EFL_STATUS_BITS) | X86_EFL_CF); \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        if ((a_OfMethod) == 0) /* INC: sign bit went from clear to set. */ \
            fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)); \
        else                   /* DEC: sign bit went from set to clear. */ \
            fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2599
2600/*
2601 * INC
2602 */
2603
2604IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2605{
2606 uint64_t uDst = *puDst;
2607 uint64_t uResult = uDst + 1;
2608 *puDst = uResult;
2609 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2610}
2611
2612# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2613
2614IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2615{
2616 uint32_t uDst = *puDst;
2617 uint32_t uResult = uDst + 1;
2618 *puDst = uResult;
2619 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2620}
2621
2622
2623IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2624{
2625 uint16_t uDst = *puDst;
2626 uint16_t uResult = uDst + 1;
2627 *puDst = uResult;
2628 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2629}
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint8_t uDst = *puDst;
2634 uint8_t uResult = uDst + 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2637}
2638
2639# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2640
2641
2642/*
2643 * DEC
2644 */
2645
2646IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2647{
2648 uint64_t uDst = *puDst;
2649 uint64_t uResult = uDst - 1;
2650 *puDst = uResult;
2651 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2652}
2653
2654# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2655
2656IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2657{
2658 uint32_t uDst = *puDst;
2659 uint32_t uResult = uDst - 1;
2660 *puDst = uResult;
2661 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2662}
2663
2664
2665IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2666{
2667 uint16_t uDst = *puDst;
2668 uint16_t uResult = uDst - 1;
2669 *puDst = uResult;
2670 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2671}
2672
2673
2674IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2675{
2676 uint8_t uDst = *puDst;
2677 uint8_t uResult = uDst - 1;
2678 *puDst = uResult;
2679 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2680}
2681
2682# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2683
2684
2685/*
2686 * NOT
2687 */
2688
2689IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2690{
2691 uint64_t uDst = *puDst;
2692 uint64_t uResult = ~uDst;
2693 *puDst = uResult;
2694 /* EFLAGS are not modified. */
2695 RT_NOREF_PV(pfEFlags);
2696}
2697
2698# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2699
2700IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2701{
2702 uint32_t uDst = *puDst;
2703 uint32_t uResult = ~uDst;
2704 *puDst = uResult;
2705 /* EFLAGS are not modified. */
2706 RT_NOREF_PV(pfEFlags);
2707}
2708
2709IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2710{
2711 uint16_t uDst = *puDst;
2712 uint16_t uResult = ~uDst;
2713 *puDst = uResult;
2714 /* EFLAGS are not modified. */
2715 RT_NOREF_PV(pfEFlags);
2716}
2717
2718IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2719{
2720 uint8_t uDst = *puDst;
2721 uint8_t uResult = ~uDst;
2722 *puDst = uResult;
2723 /* EFLAGS are not modified. */
2724 RT_NOREF_PV(pfEFlags);
2725}
2726
2727# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2728
2729
2730/*
2731 * NEG
2732 */
2733
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG
 * Recomputes the status bits (CF, PF, AF, ZF, SF, and OF) after a NEG instruction.
 *
 * Statement macro: the updated flags are stored through @a a_pfEFlags and
 * nothing is returned.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value (the negated operand).
 * @param   a_uDst          The original destination value (for CF, AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags) & ~(X86_EFL_STATUS_BITS | X86_EFL_CF); \
        /* CF is set for every non-zero operand. */ \
        fEflTmp |= (uint32_t)((a_uDst) != 0) << X86_EFL_CF_BIT; \
        /* OF is set when operand and result both have the sign bit set. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        /* AF: bit 4 differs between operand and result. */ \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2755
2756IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2757{
2758 uint64_t uDst = *puDst;
2759 uint64_t uResult = (uint64_t)0 - uDst;
2760 *puDst = uResult;
2761 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2762}
2763
2764# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2765
2766IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2767{
2768 uint32_t uDst = *puDst;
2769 uint32_t uResult = (uint32_t)0 - uDst;
2770 *puDst = uResult;
2771 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2772}
2773
2774
2775IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2776{
2777 uint16_t uDst = *puDst;
2778 uint16_t uResult = (uint16_t)0 - uDst;
2779 *puDst = uResult;
2780 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2781}
2782
2783
2784IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2785{
2786 uint8_t uDst = *puDst;
2787 uint8_t uResult = (uint8_t)0 - uDst;
2788 *puDst = uResult;
2789 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2790}
2791
2792# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2793
2794/*
2795 * Locked variants.
2796 */
2797
2798/** Emit a function for doing a locked unary operand operation. */
2799# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2800 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2801 uint32_t *pfEFlags)) \
2802 { \
2803 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2804 uint ## a_cBitsWidth ## _t uTmp; \
2805 uint32_t fEflTmp; \
2806 do \
2807 { \
2808 uTmp = uOld; \
2809 fEflTmp = *pfEFlags; \
2810 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2811 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2812 *pfEFlags = fEflTmp; \
2813 }
2814
2815EMIT_LOCKED_UNARY_OP(inc, 64)
2816EMIT_LOCKED_UNARY_OP(dec, 64)
2817EMIT_LOCKED_UNARY_OP(not, 64)
2818EMIT_LOCKED_UNARY_OP(neg, 64)
2819# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2820EMIT_LOCKED_UNARY_OP(inc, 32)
2821EMIT_LOCKED_UNARY_OP(dec, 32)
2822EMIT_LOCKED_UNARY_OP(not, 32)
2823EMIT_LOCKED_UNARY_OP(neg, 32)
2824
2825EMIT_LOCKED_UNARY_OP(inc, 16)
2826EMIT_LOCKED_UNARY_OP(dec, 16)
2827EMIT_LOCKED_UNARY_OP(not, 16)
2828EMIT_LOCKED_UNARY_OP(neg, 16)
2829
2830EMIT_LOCKED_UNARY_OP(inc, 8)
2831EMIT_LOCKED_UNARY_OP(dec, 8)
2832EMIT_LOCKED_UNARY_OP(not, 8)
2833EMIT_LOCKED_UNARY_OP(neg, 8)
2834# endif
2835
2836#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2837
2838
2839/*********************************************************************************************************************************
2840* Shifting and Rotating *
2841*********************************************************************************************************************************/
2842
2843/*
2844 * ROL
2845 */
2846#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2847IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2848{ \
2849 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2850 if (cShift) \
2851 { \
2852 if (a_cBitsWidth < 32) \
2853 cShift &= a_cBitsWidth - 1; \
2854 a_uType const uDst = *puDst; \
2855 a_uType const uResult = a_fnHlp(uDst, cShift); \
2856 *puDst = uResult; \
2857 \
2858 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
2859 it the same way as for 1 bit shifts. */ \
2860 AssertCompile(X86_EFL_CF_BIT == 0); \
2861 uint32_t fEfl = *pfEFlags; \
2862 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2863 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2864 fEfl |= fCarry; \
2865 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2866 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2867 else /* Intel 10980XE: According to the first sub-shift: */ \
2868 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2869 *pfEFlags = fEfl; \
2870 } \
2871}
2872
2873#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2874EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2875#endif
2876EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2877EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2878
2879#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2880EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2881#endif
2882EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2883EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2884
2885DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2886{
2887 return (uValue << cShift) | (uValue >> (16 - cShift));
2888}
2889#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2890EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2891#endif
2892EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2893EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2894
2895DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2896{
2897 return (uValue << cShift) | (uValue >> (8 - cShift));
2898}
2899#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2900EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2901#endif
2902EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2903EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2904
2905
2906/*
2907 * ROR
2908 */
2909#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2910IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2911{ \
2912 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2913 if (cShift) \
2914 { \
2915 if (a_cBitsWidth < 32) \
2916 cShift &= a_cBitsWidth - 1; \
2917 a_uType const uDst = *puDst; \
2918 a_uType const uResult = a_fnHlp(uDst, cShift); \
2919 *puDst = uResult; \
2920 \
2921 /* Calc EFLAGS: */ \
2922 AssertCompile(X86_EFL_CF_BIT == 0); \
2923 uint32_t fEfl = *pfEFlags; \
2924 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2925 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2926 fEfl |= fCarry; \
2927 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2928 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2929 else /* Intel 10980XE: According to the first sub-shift: */ \
2930 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2931 *pfEFlags = fEfl; \
2932 } \
2933}
2934
2935#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2936EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2937#endif
2938EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2939EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2940
2941#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2942EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2943#endif
2944EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2945EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2946
2947DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2948{
2949 return (uValue >> cShift) | (uValue << (16 - cShift));
2950}
2951#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2952EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2953#endif
2954EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2955EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2956
2957DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2958{
2959 return (uValue >> cShift) | (uValue << (8 - cShift));
2960}
2961#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2962EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2963#endif
2964EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2965EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2966
2967
2968/*
2969 * RCL
2970 */
2971#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2972IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2973{ \
2974 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2975 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2976 cShift %= a_cBitsWidth + 1; \
2977 if (cShift) \
2978 { \
2979 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2980 cShift %= a_cBitsWidth + 1; \
2981 a_uType const uDst = *puDst; \
2982 a_uType uResult = uDst << cShift; \
2983 if (cShift > 1) \
2984 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2985 \
2986 AssertCompile(X86_EFL_CF_BIT == 0); \
2987 uint32_t fEfl = *pfEFlags; \
2988 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2989 uResult |= (a_uType)fInCarry << (cShift - 1); \
2990 \
2991 *puDst = uResult; \
2992 \
2993 /* Calc EFLAGS. */ \
2994 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2995 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2996 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2997 fEfl |= fOutCarry; \
2998 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2999 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3000 else /* Intel 10980XE: According to the first sub-shift: */ \
3001 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3002 *pfEFlags = fEfl; \
3003 } \
3004}
3005
3006#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3007EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3008#endif
3009EMIT_RCL(64, uint64_t, _intel, 1)
3010EMIT_RCL(64, uint64_t, _amd, 0)
3011
3012#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3013EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3014#endif
3015EMIT_RCL(32, uint32_t, _intel, 1)
3016EMIT_RCL(32, uint32_t, _amd, 0)
3017
3018#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3019EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3020#endif
3021EMIT_RCL(16, uint16_t, _intel, 1)
3022EMIT_RCL(16, uint16_t, _amd, 0)
3023
3024#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3025EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3026#endif
3027EMIT_RCL(8, uint8_t, _intel, 1)
3028EMIT_RCL(8, uint8_t, _amd, 0)
3029
3030
3031/*
3032 * RCR
3033 */
3034#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3035IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3036{ \
3037 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3038 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3039 cShift %= a_cBitsWidth + 1; \
3040 if (cShift) \
3041 { \
3042 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3043 cShift %= a_cBitsWidth + 1; \
3044 a_uType const uDst = *puDst; \
3045 a_uType uResult = uDst >> cShift; \
3046 if (cShift > 1) \
3047 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3048 \
3049 AssertCompile(X86_EFL_CF_BIT == 0); \
3050 uint32_t fEfl = *pfEFlags; \
3051 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3052 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3053 *puDst = uResult; \
3054 \
3055 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3056 it the same way as for 1 bit shifts. */ \
3057 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3058 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3059 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3060 fEfl |= fOutCarry; \
3061 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3062 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3063 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3064 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3065 *pfEFlags = fEfl; \
3066 } \
3067}
3068
3069#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3070EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3071#endif
3072EMIT_RCR(64, uint64_t, _intel, 1)
3073EMIT_RCR(64, uint64_t, _amd, 0)
3074
3075#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3076EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3077#endif
3078EMIT_RCR(32, uint32_t, _intel, 1)
3079EMIT_RCR(32, uint32_t, _amd, 0)
3080
3081#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3082EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3083#endif
3084EMIT_RCR(16, uint16_t, _intel, 1)
3085EMIT_RCR(16, uint16_t, _amd, 0)
3086
3087#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3088EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3089#endif
3090EMIT_RCR(8, uint8_t, _intel, 1)
3091EMIT_RCR(8, uint8_t, _amd, 0)
3092
3093
3094/*
3095 * SHL
3096 */
3097#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3098IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3099{ \
3100 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3101 if (cShift) \
3102 { \
3103 a_uType const uDst = *puDst; \
3104 a_uType uResult = uDst << cShift; \
3105 *puDst = uResult; \
3106 \
3107 /* Calc EFLAGS. */ \
3108 AssertCompile(X86_EFL_CF_BIT == 0); \
3109 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3110 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3111 fEfl |= fCarry; \
3112 if (!a_fIntelFlags) \
3113 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3114 else \
3115 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3116 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3117 fEfl |= X86_EFL_CALC_ZF(uResult); \
3118 fEfl |= g_afParity[uResult & 0xff]; \
3119 if (!a_fIntelFlags) \
3120 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3121 *pfEFlags = fEfl; \
3122 } \
3123}
3124
3125#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3126EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3127#endif
3128EMIT_SHL(64, uint64_t, _intel, 1)
3129EMIT_SHL(64, uint64_t, _amd, 0)
3130
3131#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3132EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3133#endif
3134EMIT_SHL(32, uint32_t, _intel, 1)
3135EMIT_SHL(32, uint32_t, _amd, 0)
3136
3137#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3138EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3139#endif
3140EMIT_SHL(16, uint16_t, _intel, 1)
3141EMIT_SHL(16, uint16_t, _amd, 0)
3142
3143#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3144EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3145#endif
3146EMIT_SHL(8, uint8_t, _intel, 1)
3147EMIT_SHL(8, uint8_t, _amd, 0)
3148
3149
3150/*
3151 * SHR
3152 */
3153#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3154IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3155{ \
3156 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3157 if (cShift) \
3158 { \
3159 a_uType const uDst = *puDst; \
3160 a_uType uResult = uDst >> cShift; \
3161 *puDst = uResult; \
3162 \
3163 /* Calc EFLAGS. */ \
3164 AssertCompile(X86_EFL_CF_BIT == 0); \
3165 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3166 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3167 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3168 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3169 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3170 fEfl |= X86_EFL_CALC_ZF(uResult); \
3171 fEfl |= g_afParity[uResult & 0xff]; \
3172 if (!a_fIntelFlags) \
3173 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3174 *pfEFlags = fEfl; \
3175 } \
3176}
3177
3178#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3179EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3180#endif
3181EMIT_SHR(64, uint64_t, _intel, 1)
3182EMIT_SHR(64, uint64_t, _amd, 0)
3183
3184#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3185EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3186#endif
3187EMIT_SHR(32, uint32_t, _intel, 1)
3188EMIT_SHR(32, uint32_t, _amd, 0)
3189
3190#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3191EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3192#endif
3193EMIT_SHR(16, uint16_t, _intel, 1)
3194EMIT_SHR(16, uint16_t, _amd, 0)
3195
3196#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3197EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3198#endif
3199EMIT_SHR(8, uint8_t, _intel, 1)
3200EMIT_SHR(8, uint8_t, _amd, 0)
3201
3202
3203/*
3204 * SAR
3205 */
3206#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3207IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3208{ \
3209 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3210 if (cShift) \
3211 { \
3212 a_iType const iDst = (a_iType)*puDst; \
3213 a_uType uResult = iDst >> cShift; \
3214 *puDst = uResult; \
3215 \
3216 /* Calc EFLAGS. \
3217 Note! The OF flag is always zero because the result never differs from the input. */ \
3218 AssertCompile(X86_EFL_CF_BIT == 0); \
3219 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3220 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3221 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3222 fEfl |= X86_EFL_CALC_ZF(uResult); \
3223 fEfl |= g_afParity[uResult & 0xff]; \
3224 if (!a_fIntelFlags) \
3225 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3226 *pfEFlags = fEfl; \
3227 } \
3228}
3229
3230#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3231EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3232#endif
3233EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3234EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3235
3236#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3237EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3238#endif
3239EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3240EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3241
3242#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3243EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3244#endif
3245EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3246EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3247
3248#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3249EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3250#endif
3251EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3252EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3253
3254
3255/*
3256 * SHLD
3257 *
3258 * - CF is the last bit shifted out of puDst.
3259 * - AF is always cleared by Intel 10980XE.
3260 * - AF is always set by AMD 3990X.
3261 * - OF is set according to the first shift on Intel 10980XE, it seems.
3262 * - OF is set according to the last sub-shift on AMD 3990X.
3263 * - ZF, SF and PF are calculated according to the result by both vendors.
3264 *
3265 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3266 * pick either the source register or the destination register for input bits
3267 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3268 * intel has changed behaviour here several times. We implement what current
3269 * skylake based does for now, we can extend this later as needed.
3270 */
3271#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3272IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3273 uint32_t *pfEFlags)) \
3274{ \
3275 cShift &= a_cBitsWidth - 1; \
3276 if (cShift) \
3277 { \
3278 a_uType const uDst = *puDst; \
3279 a_uType uResult = uDst << cShift; \
3280 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3281 *puDst = uResult; \
3282 \
3283 /* CALC EFLAGS: */ \
3284 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3285 if (a_fIntelFlags) \
3286 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3287 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3288 else \
3289 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3290 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3291 fEfl |= X86_EFL_AF; \
3292 } \
3293 AssertCompile(X86_EFL_CF_BIT == 0); \
3294 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3295 fEfl |= g_afParity[uResult & 0xff]; \
3296 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3297 fEfl |= X86_EFL_CALC_ZF(uResult); \
3298 *pfEFlags = fEfl; \
3299 } \
3300}
3301
3302#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3303EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3304#endif
3305EMIT_SHLD(64, uint64_t, _intel, 1)
3306EMIT_SHLD(64, uint64_t, _amd, 0)
3307
3308#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3309EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3310#endif
3311EMIT_SHLD(32, uint32_t, _intel, 1)
3312EMIT_SHLD(32, uint32_t, _amd, 0)
3313
3314#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3315IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3316{ \
3317 cShift &= 31; \
3318 if (cShift) \
3319 { \
3320 uint16_t const uDst = *puDst; \
3321 uint64_t const uTmp = a_fIntelFlags \
3322 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3323 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3324 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3325 *puDst = uResult; \
3326 \
3327 /* CALC EFLAGS: */ \
3328 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3329 AssertCompile(X86_EFL_CF_BIT == 0); \
3330 if (a_fIntelFlags) \
3331 { \
3332 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3333 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3334 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3335 } \
3336 else \
3337 { \
3338 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3339 if (cShift < 16) \
3340 { \
3341 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3342 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3343 } \
3344 else \
3345 { \
3346 if (cShift == 16) \
3347 fEfl |= uDst & X86_EFL_CF; \
3348 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3349 } \
3350 fEfl |= X86_EFL_AF; \
3351 } \
3352 fEfl |= g_afParity[uResult & 0xff]; \
3353 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3354 fEfl |= X86_EFL_CALC_ZF(uResult); \
3355 *pfEFlags = fEfl; \
3356 } \
3357}
3358
3359#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3360EMIT_SHLD_16(RT_NOTHING, 1)
3361#endif
3362EMIT_SHLD_16(_intel, 1)
3363EMIT_SHLD_16(_amd, 0)
3364
3365
3366/*
3367 * SHRD
3368 *
3369 * EFLAGS behaviour seems to be the same as with SHLD:
3370 * - CF is the last bit shifted out of puDst.
3371 * - AF is always cleared by Intel 10980XE.
3372 * - AF is always set by AMD 3990X.
3373 * - OF is set according to the first shift on Intel 10980XE, it seems.
3374 * - OF is set according to the last sub-shift on AMD 3990X.
3375 * - ZF, SF and PF are calculated according to the result by both vendors.
3376 *
3377 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3378 * pick either the source register or the destination register for input bits
3379 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3380 * intel has changed behaviour here several times. We implement what current
3381 * skylake based does for now, we can extend this later as needed.
3382 */
3383#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3384IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3385{ \
3386 cShift &= a_cBitsWidth - 1; \
3387 if (cShift) \
3388 { \
3389 a_uType const uDst = *puDst; \
3390 a_uType uResult = uDst >> cShift; \
3391 uResult |= uSrc << (a_cBitsWidth - cShift); \
3392 *puDst = uResult; \
3393 \
3394 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3395 AssertCompile(X86_EFL_CF_BIT == 0); \
3396 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3397 if (a_fIntelFlags) \
3398 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3399 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3400 else \
3401 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3402 if (cShift > 1) /* Set according to last shift. */ \
3403 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3404 else \
3405 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3406 fEfl |= X86_EFL_AF; \
3407 } \
3408 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3409 fEfl |= X86_EFL_CALC_ZF(uResult); \
3410 fEfl |= g_afParity[uResult & 0xff]; \
3411 *pfEFlags = fEfl; \
3412 } \
3413}
3414
3415#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3416EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3417#endif
3418EMIT_SHRD(64, uint64_t, _intel, 1)
3419EMIT_SHRD(64, uint64_t, _amd, 0)
3420
3421#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3422EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3423#endif
3424EMIT_SHRD(32, uint32_t, _intel, 1)
3425EMIT_SHRD(32, uint32_t, _amd, 0)
3426
3427#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3428IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3429{ \
3430 cShift &= 31; \
3431 if (cShift) \
3432 { \
3433 uint16_t const uDst = *puDst; \
3434 uint64_t const uTmp = a_fIntelFlags \
3435 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3436 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3437 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3438 *puDst = uResult; \
3439 \
3440 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3441 AssertCompile(X86_EFL_CF_BIT == 0); \
3442 if (a_fIntelFlags) \
3443 { \
3444 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3445 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3446 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3447 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3448 } \
3449 else \
3450 { \
3451 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3452 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3453 /* AMD 3990X: Set according to last shift. AF always set. */ \
3454 if (cShift > 1) /* Set according to last shift. */ \
3455 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3456 else \
3457 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3458 fEfl |= X86_EFL_AF; \
3459 } \
3460 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3461 fEfl |= X86_EFL_CALC_ZF(uResult); \
3462 fEfl |= g_afParity[uResult & 0xff]; \
3463 *pfEFlags = fEfl; \
3464 } \
3465}
3466
3467#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3468EMIT_SHRD_16(RT_NOTHING, 1)
3469#endif
3470EMIT_SHRD_16(_intel, 1)
3471EMIT_SHRD_16(_amd, 0)
3472
3473
3474/*
3475 * RORX (BMI2)
3476 */
3477#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3478IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3479{ \
3480 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3481}
3482
3483#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3484EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3485#endif
3486#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3487EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3488#endif
3489
3490
3491/*
3492 * SHLX (BMI2)
3493 */
3494#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3495IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3496{ \
3497 cShift &= a_cBitsWidth - 1; \
3498 *puDst = uSrc << cShift; \
3499}
3500
3501#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3502EMIT_SHLX(64, uint64_t, RT_NOTHING)
3503EMIT_SHLX(64, uint64_t, _fallback)
3504#endif
3505#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3506EMIT_SHLX(32, uint32_t, RT_NOTHING)
3507EMIT_SHLX(32, uint32_t, _fallback)
3508#endif
3509
3510
3511/*
3512 * SHRX (BMI2)
3513 */
3514#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3515IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3516{ \
3517 cShift &= a_cBitsWidth - 1; \
3518 *puDst = uSrc >> cShift; \
3519}
3520
3521#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3522EMIT_SHRX(64, uint64_t, RT_NOTHING)
3523EMIT_SHRX(64, uint64_t, _fallback)
3524#endif
3525#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3526EMIT_SHRX(32, uint32_t, RT_NOTHING)
3527EMIT_SHRX(32, uint32_t, _fallback)
3528#endif
3529
3530
3531/*
3532 * SARX (BMI2)
3533 */
3534#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3535IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3536{ \
3537 cShift &= a_cBitsWidth - 1; \
3538 *puDst = (a_iType)uSrc >> cShift; \
3539}
3540
3541#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3542EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3543EMIT_SARX(64, uint64_t, int64_t, _fallback)
3544#endif
3545#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3546EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3547EMIT_SARX(32, uint32_t, int32_t, _fallback)
3548#endif
3549
3550
3551/*
3552 * PDEP (BMI2)
3553 */
3554#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PDEP(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PDEP(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PDEP(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PDEP(32, uint32_t, _fallback)
3575
3576/*
3577 * PEXT (BMI2)
3578 */
3579#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3580IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3581{ \
3582 a_uType uResult = 0; \
3583 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3584 if (fMask & ((a_uType)1 << iMaskBit)) \
3585 { \
3586 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3587 iBit++; \
3588 } \
3589 *puDst = uResult; \
3590}
3591
3592#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3593EMIT_PEXT(64, uint64_t, RT_NOTHING)
3594#endif
3595EMIT_PEXT(64, uint64_t, _fallback)
3596#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3597EMIT_PEXT(32, uint32_t, RT_NOTHING)
3598#endif
3599EMIT_PEXT(32, uint32_t, _fallback)
3600
3601
3602#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3603
3604# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3605/*
3606 * BSWAP
3607 */
3608
3609IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3610{
3611 *puDst = ASMByteSwapU64(*puDst);
3612}
3613
3614
3615IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3616{
3617 *puDst = ASMByteSwapU32(*puDst);
3618}
3619
3620
3621/* Note! undocument, so 32-bit arg */
3622IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3623{
3624#if 0
3625 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3626#else
3627 /* This is the behaviour AMD 3990x (64-bit mode): */
3628 *(uint16_t *)puDst = 0;
3629#endif
3630}
3631
3632# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3633
3634
3635
3636# if defined(IEM_WITHOUT_ASSEMBLY)
3637
3638/*
3639 * LFENCE, SFENCE & MFENCE.
3640 */
3641
3642IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3643{
3644 ASMReadFence();
3645}
3646
3647
3648IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3649{
3650 ASMWriteFence();
3651}
3652
3653
3654IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3655{
3656 ASMMemoryFence();
3657}
3658
3659
3660# ifndef RT_ARCH_ARM64
3661IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3662{
3663 ASMMemoryFence();
3664}
3665# endif
3666
3667# endif
3668
3669#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3670
3671
3672IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3673{
3674 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3675 {
3676 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3677 *pu16Dst |= u16Src & X86_SEL_RPL;
3678
3679 *pfEFlags |= X86_EFL_ZF;
3680 }
3681 else
3682 *pfEFlags &= ~X86_EFL_ZF;
3683}
3684
3685
3686#if defined(IEM_WITHOUT_ASSEMBLY)
3687
3688/*********************************************************************************************************************************
3689* x87 FPU Loads *
3690*********************************************************************************************************************************/
3691
3692IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3693{
3694 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3695 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3696 {
3697 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3698 pFpuRes->r80Result.sj64.fInteger = 1;
3699 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3700 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3701 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3702 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3703 }
3704 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3705 {
3706 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3707 pFpuRes->r80Result.s.uExponent = 0;
3708 pFpuRes->r80Result.s.uMantissa = 0;
3709 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3710 }
3711 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3712 {
3713 /* Subnormal values gets normalized. */
3714 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3715 pFpuRes->r80Result.sj64.fInteger = 1;
3716 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3717 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3718 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3719 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3720 pFpuRes->FSW |= X86_FSW_DE;
3721 if (!(pFpuState->FCW & X86_FCW_DM))
3722 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3723 }
3724 else if (RTFLOAT32U_IS_INF(pr32Val))
3725 {
3726 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3727 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3728 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3729 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3730 }
3731 else
3732 {
3733 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3734 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3735 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3736 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3737 pFpuRes->r80Result.sj64.fInteger = 1;
3738 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3739 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3740 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3741 {
3742 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3743 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3744 pFpuRes->FSW |= X86_FSW_IE;
3745
3746 if (!(pFpuState->FCW & X86_FCW_IM))
3747 {
3748 /* The value is not pushed. */
3749 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3750 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3751 pFpuRes->r80Result.au64[0] = 0;
3752 pFpuRes->r80Result.au16[4] = 0;
3753 }
3754 }
3755 else
3756 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3757 }
3758}
3759
3760
3761IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3762{
3763 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3764 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3765 {
3766 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3767 pFpuRes->r80Result.sj64.fInteger = 1;
3768 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3769 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3770 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3771 }
3772 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3773 {
3774 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3775 pFpuRes->r80Result.s.uExponent = 0;
3776 pFpuRes->r80Result.s.uMantissa = 0;
3777 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3778 }
3779 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3780 {
3781 /* Subnormal values gets normalized. */
3782 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3783 pFpuRes->r80Result.sj64.fInteger = 1;
3784 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3785 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3786 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3787 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3788 pFpuRes->FSW |= X86_FSW_DE;
3789 if (!(pFpuState->FCW & X86_FCW_DM))
3790 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3791 }
3792 else if (RTFLOAT64U_IS_INF(pr64Val))
3793 {
3794 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3795 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3796 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3797 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3798 }
3799 else
3800 {
3801 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3802 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3803 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3804 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3805 pFpuRes->r80Result.sj64.fInteger = 1;
3806 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3807 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3808 {
3809 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3810 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3811 pFpuRes->FSW |= X86_FSW_IE;
3812
3813 if (!(pFpuState->FCW & X86_FCW_IM))
3814 {
3815 /* The value is not pushed. */
3816 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3817 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3818 pFpuRes->r80Result.au64[0] = 0;
3819 pFpuRes->r80Result.au16[4] = 0;
3820 }
3821 }
3822 else
3823 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3824 }
3825}
3826
3827
3828IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3829{
3830 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3831 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3832 /* Raises no exceptions. */
3833 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3834}
3835
3836
3837IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3838{
3839 pFpuRes->r80Result.sj64.fSign = 0;
3840 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3841 pFpuRes->r80Result.sj64.fInteger = 1;
3842 pFpuRes->r80Result.sj64.uFraction = 0;
3843
3844 /*
3845 * FPU status word:
3846 * - TOP is irrelevant, but we must match x86 assembly version.
3847 * - C1 is always cleared as we don't have any stack overflows.
3848 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3849 */
3850 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3851}
3852
3853
3854IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3855{
3856 pFpuRes->r80Result.sj64.fSign = 0;
3857 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3858 pFpuRes->r80Result.sj64.fInteger = 1;
3859 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3860 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3861 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3862 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3863}
3864
3865
3866IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3867{
3868 pFpuRes->r80Result.sj64.fSign = 0;
3869 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3870 pFpuRes->r80Result.sj64.fInteger = 1;
3871 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3872 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3873 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3874}
3875
3876
3877IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3878{
3879 pFpuRes->r80Result.sj64.fSign = 0;
3880 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3881 pFpuRes->r80Result.sj64.fInteger = 1;
3882 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3883 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3884 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3885 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3886}
3887
3888
3889IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3890{
3891 pFpuRes->r80Result.sj64.fSign = 0;
3892 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3893 pFpuRes->r80Result.sj64.fInteger = 1;
3894 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3895 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3896 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3897 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3898}
3899
3900
3901IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3902{
3903 pFpuRes->r80Result.sj64.fSign = 0;
3904 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3905 pFpuRes->r80Result.sj64.fInteger = 1;
3906 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3907 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3908 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3909 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3910}
3911
3912
3913IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3914{
3915 pFpuRes->r80Result.s.fSign = 0;
3916 pFpuRes->r80Result.s.uExponent = 0;
3917 pFpuRes->r80Result.s.uMantissa = 0;
3918 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3919}
3920
3921#define EMIT_FILD(a_cBits) \
3922IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3923 int ## a_cBits ## _t const *piVal)) \
3924{ \
3925 int ## a_cBits ## _t iVal = *piVal; \
3926 if (iVal == 0) \
3927 { \
3928 pFpuRes->r80Result.s.fSign = 0; \
3929 pFpuRes->r80Result.s.uExponent = 0; \
3930 pFpuRes->r80Result.s.uMantissa = 0; \
3931 } \
3932 else \
3933 { \
3934 if (iVal > 0) \
3935 pFpuRes->r80Result.s.fSign = 0; \
3936 else \
3937 { \
3938 pFpuRes->r80Result.s.fSign = 1; \
3939 iVal = -iVal; \
3940 } \
3941 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3942 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3943 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3944 } \
3945 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3946}
3947EMIT_FILD(16)
3948EMIT_FILD(32)
3949EMIT_FILD(64)
3950
3951
3952IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3953{
3954 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3955 if ( pd80Val->s.abPairs[0] == 0
3956 && pd80Val->s.abPairs[1] == 0
3957 && pd80Val->s.abPairs[2] == 0
3958 && pd80Val->s.abPairs[3] == 0
3959 && pd80Val->s.abPairs[4] == 0
3960 && pd80Val->s.abPairs[5] == 0
3961 && pd80Val->s.abPairs[6] == 0
3962 && pd80Val->s.abPairs[7] == 0
3963 && pd80Val->s.abPairs[8] == 0)
3964 {
3965 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3966 pFpuRes->r80Result.s.uExponent = 0;
3967 pFpuRes->r80Result.s.uMantissa = 0;
3968 }
3969 else
3970 {
3971 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3972
3973 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3974 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3975 cPairs--;
3976
3977 uint64_t uVal = 0;
3978 uint64_t uFactor = 1;
3979 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3980 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3981 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3982
3983 unsigned const cBits = ASMBitLastSetU64(uVal);
3984 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3985 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3986 }
3987}
3988
3989
3990/*********************************************************************************************************************************
3991* x87 FPU Stores *
3992*********************************************************************************************************************************/
3993
3994/**
3995 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
3996 *
3997 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
3998 *
3999 * @returns Updated FPU status word value.
4000 * @param fSignIn Incoming sign indicator.
4001 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4002 * @param iExponentIn Unbiased exponent.
4003 * @param fFcw The FPU control word.
4004 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4005 * @param pr32Dst Where to return the output value, if one should be
4006 * returned.
4007 *
4008 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4009 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4010 */
4011static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4012 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4013{
4014 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4015 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4016 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4017 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4018 ? fRoundingOffMask
4019 : 0;
4020 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4021
4022 /*
4023 * Deal with potential overflows/underflows first, optimizing for none.
4024 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4025 */
4026 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4027 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4028 { /* likely? */ }
4029 /*
4030 * Underflow if the exponent zero or negative. This is attempted mapped
4031 * to a subnormal number when possible, with some additional trickery ofc.
4032 */
4033 else if (iExponentOut <= 0)
4034 {
4035 bool const fIsTiny = iExponentOut < 0
4036 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4037 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4038 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4039 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4040
4041 if (iExponentOut <= 0)
4042 {
4043 uMantissaIn = iExponentOut <= -63
4044 ? uMantissaIn != 0
4045 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4046 fRoundedOff = uMantissaIn & fRoundingOffMask;
4047 if (fRoundedOff && fIsTiny)
4048 fFsw |= X86_FSW_UE;
4049 iExponentOut = 0;
4050 }
4051 }
4052 /*
4053 * Overflow if at or above max exponent value or if we will reach max
4054 * when rounding. Will return +/-zero or +/-max value depending on
4055 * whether we're rounding or not.
4056 */
4057 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4058 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4059 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4060 {
4061 fFsw |= X86_FSW_OE;
4062 if (!(fFcw & X86_FCW_OM))
4063 return fFsw | X86_FSW_ES | X86_FSW_B;
4064 fFsw |= X86_FSW_PE;
4065 if (uRoundingAdd)
4066 fFsw |= X86_FSW_C1;
4067 if (!(fFcw & X86_FCW_PM))
4068 fFsw |= X86_FSW_ES | X86_FSW_B;
4069
4070 pr32Dst->s.fSign = fSignIn;
4071 if (uRoundingAdd)
4072 { /* Zero */
4073 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4074 pr32Dst->s.uFraction = 0;
4075 }
4076 else
4077 { /* Max */
4078 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4079 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4080 }
4081 return fFsw;
4082 }
4083
4084 /*
4085 * Normal or subnormal number.
4086 */
4087 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4088 uint64_t uMantissaOut = uMantissaIn;
4089 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4090 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4091 || fRoundedOff != uRoundingAdd)
4092 {
4093 uMantissaOut = uMantissaIn + uRoundingAdd;
4094 if (uMantissaOut >= uMantissaIn)
4095 { /* likely */ }
4096 else
4097 {
4098 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4099 iExponentOut++;
4100 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4101 fFsw |= X86_FSW_C1;
4102 }
4103 }
4104 else
4105 uMantissaOut = uMantissaIn;
4106
4107 /* Truncate the mantissa and set the return value. */
4108 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4109
4110 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4111 pr32Dst->s.uExponent = iExponentOut;
4112 pr32Dst->s.fSign = fSignIn;
4113
4114 /* Set status flags realted to rounding. */
4115 if (fRoundedOff)
4116 {
4117 fFsw |= X86_FSW_PE;
4118 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4119 fFsw |= X86_FSW_C1;
4120 if (!(fFcw & X86_FCW_PM))
4121 fFsw |= X86_FSW_ES | X86_FSW_B;
4122 }
4123
4124 return fFsw;
4125}
4126
4127
4128/**
4129 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4130 */
4131IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4132 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4133{
4134 uint16_t const fFcw = pFpuState->FCW;
4135 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4136 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4137 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4138 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4139 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4140 {
4141 pr32Dst->s.fSign = pr80Src->s.fSign;
4142 pr32Dst->s.uExponent = 0;
4143 pr32Dst->s.uFraction = 0;
4144 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4145 }
4146 else if (RTFLOAT80U_IS_INF(pr80Src))
4147 {
4148 pr32Dst->s.fSign = pr80Src->s.fSign;
4149 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4150 pr32Dst->s.uFraction = 0;
4151 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4152 }
4153 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4154 {
4155 /* Mapped to +/-QNaN */
4156 pr32Dst->s.fSign = pr80Src->s.fSign;
4157 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4158 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4159 }
4160 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4161 {
4162 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4163 if (fFcw & X86_FCW_IM)
4164 {
4165 pr32Dst->s.fSign = 1;
4166 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4167 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4168 fFsw |= X86_FSW_IE;
4169 }
4170 else
4171 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4172 }
4173 else if (RTFLOAT80U_IS_NAN(pr80Src))
4174 {
4175 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4176 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4177 {
4178 pr32Dst->s.fSign = pr80Src->s.fSign;
4179 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4180 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4181 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4182 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4183 fFsw |= X86_FSW_IE;
4184 }
4185 else
4186 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4187 }
4188 else
4189 {
4190 /* Denormal values causes both an underflow and precision exception. */
4191 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4192 if (fFcw & X86_FCW_UM)
4193 {
4194 pr32Dst->s.fSign = pr80Src->s.fSign;
4195 pr32Dst->s.uExponent = 0;
4196 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4197 {
4198 pr32Dst->s.uFraction = 1;
4199 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4200 if (!(fFcw & X86_FCW_PM))
4201 fFsw |= X86_FSW_ES | X86_FSW_B;
4202 }
4203 else
4204 {
4205 pr32Dst->s.uFraction = 0;
4206 fFsw |= X86_FSW_UE | X86_FSW_PE;
4207 if (!(fFcw & X86_FCW_PM))
4208 fFsw |= X86_FSW_ES | X86_FSW_B;
4209 }
4210 }
4211 else
4212 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4213 }
4214 *pu16FSW = fFsw;
4215}
4216
4217
4218/**
4219 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4220 *
4221 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4222 *
4223 * @returns Updated FPU status word value.
4224 * @param fSignIn Incoming sign indicator.
4225 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4226 * @param iExponentIn Unbiased exponent.
4227 * @param fFcw The FPU control word.
4228 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4229 * @param pr64Dst Where to return the output value, if one should be
4230 * returned.
4231 *
4232 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4233 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4234 */
4235static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4236 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4237{
4238 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4239 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4240 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4241 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4242 ? fRoundingOffMask
4243 : 0;
4244 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4245
4246 /*
4247 * Deal with potential overflows/underflows first, optimizing for none.
4248 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4249 */
4250 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4251 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4252 { /* likely? */ }
4253 /*
4254 * Underflow if the exponent zero or negative. This is attempted mapped
4255 * to a subnormal number when possible, with some additional trickery ofc.
4256 */
4257 else if (iExponentOut <= 0)
4258 {
4259 bool const fIsTiny = iExponentOut < 0
4260 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4261 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4262 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4263 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4264
4265 if (iExponentOut <= 0)
4266 {
4267 uMantissaIn = iExponentOut <= -63
4268 ? uMantissaIn != 0
4269 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4270 fRoundedOff = uMantissaIn & fRoundingOffMask;
4271 if (fRoundedOff && fIsTiny)
4272 fFsw |= X86_FSW_UE;
4273 iExponentOut = 0;
4274 }
4275 }
4276 /*
4277 * Overflow if at or above max exponent value or if we will reach max
4278 * when rounding. Will return +/-zero or +/-max value depending on
4279 * whether we're rounding or not.
4280 */
4281 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4282 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4283 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4284 {
4285 fFsw |= X86_FSW_OE;
4286 if (!(fFcw & X86_FCW_OM))
4287 return fFsw | X86_FSW_ES | X86_FSW_B;
4288 fFsw |= X86_FSW_PE;
4289 if (uRoundingAdd)
4290 fFsw |= X86_FSW_C1;
4291 if (!(fFcw & X86_FCW_PM))
4292 fFsw |= X86_FSW_ES | X86_FSW_B;
4293
4294 pr64Dst->s64.fSign = fSignIn;
4295 if (uRoundingAdd)
4296 { /* Zero */
4297 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4298 pr64Dst->s64.uFraction = 0;
4299 }
4300 else
4301 { /* Max */
4302 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4303 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4304 }
4305 return fFsw;
4306 }
4307
4308 /*
4309 * Normal or subnormal number.
4310 */
4311 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4312 uint64_t uMantissaOut = uMantissaIn;
4313 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4314 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4315 || fRoundedOff != uRoundingAdd)
4316 {
4317 uMantissaOut = uMantissaIn + uRoundingAdd;
4318 if (uMantissaOut >= uMantissaIn)
4319 { /* likely */ }
4320 else
4321 {
4322 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4323 iExponentOut++;
4324 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4325 fFsw |= X86_FSW_C1;
4326 }
4327 }
4328 else
4329 uMantissaOut = uMantissaIn;
4330
4331 /* Truncate the mantissa and set the return value. */
4332 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4333
4334 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4335 pr64Dst->s64.uExponent = iExponentOut;
4336 pr64Dst->s64.fSign = fSignIn;
4337
4338 /* Set status flags realted to rounding. */
4339 if (fRoundedOff)
4340 {
4341 fFsw |= X86_FSW_PE;
4342 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4343 fFsw |= X86_FSW_C1;
4344 if (!(fFcw & X86_FCW_PM))
4345 fFsw |= X86_FSW_ES | X86_FSW_B;
4346 }
4347
4348 return fFsw;
4349}
4350
4351
4352/**
4353 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4354 */
4355IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4356 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4357{
4358 uint16_t const fFcw = pFpuState->FCW;
4359 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4360 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4361 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4362 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4363 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4364 {
4365 pr64Dst->s64.fSign = pr80Src->s.fSign;
4366 pr64Dst->s64.uExponent = 0;
4367 pr64Dst->s64.uFraction = 0;
4368 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4369 }
4370 else if (RTFLOAT80U_IS_INF(pr80Src))
4371 {
4372 pr64Dst->s64.fSign = pr80Src->s.fSign;
4373 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4374 pr64Dst->s64.uFraction = 0;
4375 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4376 }
4377 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4378 {
4379 /* Mapped to +/-QNaN */
4380 pr64Dst->s64.fSign = pr80Src->s.fSign;
4381 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4382 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4383 }
4384 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4385 {
4386 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4387 if (fFcw & X86_FCW_IM)
4388 {
4389 pr64Dst->s64.fSign = 1;
4390 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4391 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4392 fFsw |= X86_FSW_IE;
4393 }
4394 else
4395 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4396 }
4397 else if (RTFLOAT80U_IS_NAN(pr80Src))
4398 {
4399 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4400 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4401 {
4402 pr64Dst->s64.fSign = pr80Src->s.fSign;
4403 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4404 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4405 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4406 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4407 fFsw |= X86_FSW_IE;
4408 }
4409 else
4410 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4411 }
4412 else
4413 {
4414 /* Denormal values causes both an underflow and precision exception. */
4415 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4416 if (fFcw & X86_FCW_UM)
4417 {
4418 pr64Dst->s64.fSign = pr80Src->s.fSign;
4419 pr64Dst->s64.uExponent = 0;
4420 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4421 {
4422 pr64Dst->s64.uFraction = 1;
4423 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4424 if (!(fFcw & X86_FCW_PM))
4425 fFsw |= X86_FSW_ES | X86_FSW_B;
4426 }
4427 else
4428 {
4429 pr64Dst->s64.uFraction = 0;
4430 fFsw |= X86_FSW_UE | X86_FSW_PE;
4431 if (!(fFcw & X86_FCW_PM))
4432 fFsw |= X86_FSW_ES | X86_FSW_B;
4433 }
4434 }
4435 else
4436 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4437 }
4438 *pu16FSW = fFsw;
4439}
4440
4441
4442IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4443 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4444{
4445 /*
4446 * FPU status word:
4447 * - TOP is irrelevant, but we must match x86 assembly version (0).
4448 * - C1 is always cleared as we don't have any stack overflows.
4449 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4450 */
4451 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4452 *pr80Dst = *pr80Src;
4453}
4454
4455
4456/*
4457 *
4458 * Mantissa:
4459 * 63 56 48 40 32 24 16 8 0
4460 * v v v v v v v v v
4461 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4462 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4463 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4464 *
4465 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4466 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4467 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4468 * where we'll drop off all but bit 63.
4469 */
4470#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4471IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4472 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4473{ \
4474 uint16_t const fFcw = pFpuState->FCW; \
4475 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4476 bool const fSignIn = pr80Val->s.fSign; \
4477 \
4478 /* \
4479 * Deal with normal numbers first. \
4480 */ \
4481 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4482 { \
4483 uint64_t uMantissa = pr80Val->s.uMantissa; \
4484 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4485 \
4486 if ((uint32_t)iExponent <= a_cBits - 2) \
4487 { \
4488 unsigned const cShiftOff = 63 - iExponent; \
4489 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4490 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4491 ? RT_BIT_64(cShiftOff - 1) \
4492 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4493 ? fRoundingOffMask \
4494 : 0; \
4495 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4496 \
4497 uMantissa >>= cShiftOff; \
4498 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4499 uMantissa += uRounding; \
4500 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4501 { \
4502 if (fRoundedOff) \
4503 { \
4504 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4505 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4506 else if (uRounding) \
4507 fFsw |= X86_FSW_C1; \
4508 fFsw |= X86_FSW_PE; \
4509 if (!(fFcw & X86_FCW_PM)) \
4510 fFsw |= X86_FSW_ES | X86_FSW_B; \
4511 } \
4512 \
4513 if (!fSignIn) \
4514 *piDst = (a_iType)uMantissa; \
4515 else \
4516 *piDst = -(a_iType)uMantissa; \
4517 } \
4518 else \
4519 { \
4520 /* overflowed after rounding. */ \
4521 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4522 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4523 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4524 \
4525 /* Special case for the integer minimum value. */ \
4526 if (fSignIn) \
4527 { \
4528 *piDst = a_iTypeMin; \
4529 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4530 if (!(fFcw & X86_FCW_PM)) \
4531 fFsw |= X86_FSW_ES | X86_FSW_B; \
4532 } \
4533 else \
4534 { \
4535 fFsw |= X86_FSW_IE; \
4536 if (fFcw & X86_FCW_IM) \
4537 *piDst = a_iTypeMin; \
4538 else \
4539 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4540 } \
4541 } \
4542 } \
4543 /* \
4544 * Tiny sub-zero numbers. \
4545 */ \
4546 else if (iExponent < 0) \
4547 { \
4548 if (!fSignIn) \
4549 { \
4550 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4551 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4552 { \
4553 *piDst = 1; \
4554 fFsw |= X86_FSW_C1; \
4555 } \
4556 else \
4557 *piDst = 0; \
4558 } \
4559 else \
4560 { \
4561 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4562 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4563 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4564 *piDst = 0; \
4565 else \
4566 { \
4567 *piDst = -1; \
4568 fFsw |= X86_FSW_C1; \
4569 } \
4570 } \
4571 fFsw |= X86_FSW_PE; \
4572 if (!(fFcw & X86_FCW_PM)) \
4573 fFsw |= X86_FSW_ES | X86_FSW_B; \
4574 } \
4575 /* \
4576 * Special MIN case. \
4577 */ \
4578 else if ( fSignIn && iExponent == a_cBits - 1 \
4579 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4580 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4581 : uMantissa == RT_BIT_64(63))) \
4582 { \
4583 *piDst = a_iTypeMin; \
4584 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4585 { \
4586 fFsw |= X86_FSW_PE; \
4587 if (!(fFcw & X86_FCW_PM)) \
4588 fFsw |= X86_FSW_ES | X86_FSW_B; \
4589 } \
4590 } \
4591 /* \
4592 * Too large/small number outside the target integer range. \
4593 */ \
4594 else \
4595 { \
4596 fFsw |= X86_FSW_IE; \
4597 if (fFcw & X86_FCW_IM) \
4598 *piDst = a_iTypeIndefinite; \
4599 else \
4600 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4601 } \
4602 } \
4603 /* \
4604 * Map both +0 and -0 to integer zero (signless/+). \
4605 */ \
4606 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4607 *piDst = 0; \
4608 /* \
4609 * Denormals are just really tiny sub-zero numbers that are either rounded \
4610 * to zero, 1 or -1 depending on sign and rounding control. \
4611 */ \
4612 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4613 { \
4614 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4615 *piDst = 0; \
4616 else \
4617 { \
4618 *piDst = fSignIn ? -1 : 1; \
4619 fFsw |= X86_FSW_C1; \
4620 } \
4621 fFsw |= X86_FSW_PE; \
4622 if (!(fFcw & X86_FCW_PM)) \
4623 fFsw |= X86_FSW_ES | X86_FSW_B; \
4624 } \
4625 /* \
4626 * All other special values are considered invalid arguments and result \
4627 * in an IE exception and indefinite value if masked. \
4628 */ \
4629 else \
4630 { \
4631 fFsw |= X86_FSW_IE; \
4632 if (fFcw & X86_FCW_IM) \
4633 *piDst = a_iTypeIndefinite; \
4634 else \
4635 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4636 } \
4637 *pu16FSW = fFsw; \
4638}
4639EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4640EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4641EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4642
4643#endif /*IEM_WITHOUT_ASSEMBLY */
4644
4645
4646/*
4647 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4648 *
4649 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4650 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4651 * thus the @a a_cBitsIn.
4652 */
4653#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4654IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4655 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4656{ \
4657 uint16_t const fFcw = pFpuState->FCW; \
4658 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4659 bool const fSignIn = pr80Val->s.fSign; \
4660 \
4661 /* \
4662 * Deal with normal numbers first. \
4663 */ \
4664 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4665 { \
4666 uint64_t uMantissa = pr80Val->s.uMantissa; \
4667 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4668 \
4669 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4670 { \
4671 unsigned const cShiftOff = 63 - iExponent; \
4672 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4673 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4674 uMantissa >>= cShiftOff; \
4675 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4676 if (!fSignIn) \
4677 *piDst = (a_iType)uMantissa; \
4678 else \
4679 *piDst = -(a_iType)uMantissa; \
4680 \
4681 if (fRoundedOff) \
4682 { \
4683 fFsw |= X86_FSW_PE; \
4684 if (!(fFcw & X86_FCW_PM)) \
4685 fFsw |= X86_FSW_ES | X86_FSW_B; \
4686 } \
4687 } \
4688 /* \
4689 * Tiny sub-zero numbers. \
4690 */ \
4691 else if (iExponent < 0) \
4692 { \
4693 *piDst = 0; \
4694 fFsw |= X86_FSW_PE; \
4695 if (!(fFcw & X86_FCW_PM)) \
4696 fFsw |= X86_FSW_ES | X86_FSW_B; \
4697 } \
4698 /* \
4699 * Special MIN case. \
4700 */ \
4701 else if ( fSignIn && iExponent == a_cBits - 1 \
4702 && (a_cBits < 64 \
4703 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4704 : uMantissa == RT_BIT_64(63)) ) \
4705 { \
4706 *piDst = a_iTypeMin; \
4707 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4708 { \
4709 fFsw |= X86_FSW_PE; \
4710 if (!(fFcw & X86_FCW_PM)) \
4711 fFsw |= X86_FSW_ES | X86_FSW_B; \
4712 } \
4713 } \
4714 /* \
4715 * Figure this weirdness. \
4716 */ \
4717 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4718 { \
4719 *piDst = 0; \
4720 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4721 { \
4722 fFsw |= X86_FSW_PE; \
4723 if (!(fFcw & X86_FCW_PM)) \
4724 fFsw |= X86_FSW_ES | X86_FSW_B; \
4725 } \
4726 } \
4727 /* \
4728 * Too large/small number outside the target integer range. \
4729 */ \
4730 else \
4731 { \
4732 fFsw |= X86_FSW_IE; \
4733 if (fFcw & X86_FCW_IM) \
4734 *piDst = a_iTypeIndefinite; \
4735 else \
4736 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4737 } \
4738 } \
4739 /* \
4740 * Map both +0 and -0 to integer zero (signless/+). \
4741 */ \
4742 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4743 *piDst = 0; \
4744 /* \
4745 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4746 */ \
4747 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4748 { \
4749 *piDst = 0; \
4750 fFsw |= X86_FSW_PE; \
4751 if (!(fFcw & X86_FCW_PM)) \
4752 fFsw |= X86_FSW_ES | X86_FSW_B; \
4753 } \
4754 /* \
4755 * All other special values are considered invalid arguments and result \
4756 * in an IE exception and indefinite value if masked. \
4757 */ \
4758 else \
4759 { \
4760 fFsw |= X86_FSW_IE; \
4761 if (fFcw & X86_FCW_IM) \
4762 *piDst = a_iTypeIndefinite; \
4763 else \
4764 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4765 } \
4766 *pu16FSW = fFsw; \
4767}
4768#if defined(IEM_WITHOUT_ASSEMBLY)
4769EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4770EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4771EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4772#endif
4773EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4774EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4775
4776
4777#if defined(IEM_WITHOUT_ASSEMBLY)
4778
4779IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4780 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4781{
4782 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4783 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4784 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4785 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4786 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4787
4788 uint16_t const fFcw = pFpuState->FCW;
4789 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4790 bool const fSignIn = pr80Src->s.fSign;
4791
4792 /*
4793 * Deal with normal numbers first.
4794 */
4795 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4796 {
4797 uint64_t uMantissa = pr80Src->s.uMantissa;
4798 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
4799 if ( (uint32_t)iExponent <= 58
4800 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4801 {
4802 unsigned const cShiftOff = 63 - iExponent;
4803 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4804 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4805 ? RT_BIT_64(cShiftOff - 1)
4806 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4807 ? fRoundingOffMask
4808 : 0;
4809 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4810
4811 uMantissa >>= cShiftOff;
4812 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4813 uMantissa += uRounding;
4814 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4815 {
4816 if (fRoundedOff)
4817 {
4818 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4819 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4820 else if (uRounding)
4821 fFsw |= X86_FSW_C1;
4822 fFsw |= X86_FSW_PE;
4823 if (!(fFcw & X86_FCW_PM))
4824 fFsw |= X86_FSW_ES | X86_FSW_B;
4825 }
4826
4827 pd80Dst->s.fSign = fSignIn;
4828 pd80Dst->s.uPad = 0;
4829 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4830 {
4831 unsigned const uDigits = uMantissa % 100;
4832 uMantissa /= 100;
4833 uint8_t const bLo = uDigits % 10;
4834 uint8_t const bHi = uDigits / 10;
4835 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4836 }
4837 }
4838 else
4839 {
4840 /* overflowed after rounding. */
4841 fFsw |= X86_FSW_IE;
4842 if (fFcw & X86_FCW_IM)
4843 *pd80Dst = s_d80Indefinite;
4844 else
4845 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4846 }
4847 }
4848 /*
4849 * Tiny sub-zero numbers.
4850 */
4851 else if (iExponent < 0)
4852 {
4853 if (!fSignIn)
4854 {
4855 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4856 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4857 {
4858 *pd80Dst = s_ad80One[fSignIn];
4859 fFsw |= X86_FSW_C1;
4860 }
4861 else
4862 *pd80Dst = s_ad80Zeros[fSignIn];
4863 }
4864 else
4865 {
4866 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4867 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4868 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4869 *pd80Dst = s_ad80Zeros[fSignIn];
4870 else
4871 {
4872 *pd80Dst = s_ad80One[fSignIn];
4873 fFsw |= X86_FSW_C1;
4874 }
4875 }
4876 fFsw |= X86_FSW_PE;
4877 if (!(fFcw & X86_FCW_PM))
4878 fFsw |= X86_FSW_ES | X86_FSW_B;
4879 }
4880 /*
4881 * Too large/small number outside the target integer range.
4882 */
4883 else
4884 {
4885 fFsw |= X86_FSW_IE;
4886 if (fFcw & X86_FCW_IM)
4887 *pd80Dst = s_d80Indefinite;
4888 else
4889 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4890 }
4891 }
4892 /*
4893 * Map both +0 and -0 to integer zero (signless/+).
4894 */
4895 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4896 *pd80Dst = s_ad80Zeros[fSignIn];
4897 /*
4898 * Denormals are just really tiny sub-zero numbers that are either rounded
4899 * to zero, 1 or -1 depending on sign and rounding control.
4900 */
4901 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4902 {
4903 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4904 *pd80Dst = s_ad80Zeros[fSignIn];
4905 else
4906 {
4907 *pd80Dst = s_ad80One[fSignIn];
4908 fFsw |= X86_FSW_C1;
4909 }
4910 fFsw |= X86_FSW_PE;
4911 if (!(fFcw & X86_FCW_PM))
4912 fFsw |= X86_FSW_ES | X86_FSW_B;
4913 }
4914 /*
4915 * All other special values are considered invalid arguments and result
4916 * in an IE exception and indefinite value if masked.
4917 */
4918 else
4919 {
4920 fFsw |= X86_FSW_IE;
4921 if (fFcw & X86_FCW_IM)
4922 *pd80Dst = s_d80Indefinite;
4923 else
4924 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4925 }
4926 *pu16FSW = fFsw;
4927}
4928
4929
4930/*********************************************************************************************************************************
4931* FPU Helpers *
4932*********************************************************************************************************************************/
4933AssertCompileSize(RTFLOAT128U, 16);
4934AssertCompileSize(RTFLOAT80U, 10);
4935AssertCompileSize(RTFLOAT64U, 8);
4936AssertCompileSize(RTFLOAT32U, 4);
4937
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro declares a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized.  If @a a_pr80Val points to a pseudo-denormal, the
 * value is copied into that variable with uExponent set to 1 and @a a_pr80Val
 * is updated to point to the copy.  Otherwise nothing is changed.
 *
 * @param   a_pr80Val           Pointer variable (lvalue) referring to the
 *                              input value; may be re-pointed by the macro.
 * @param   a_r80ValNormalized  Name of the local variable to declare.
 *
 * @note    This must be applied before calling SoftFloat with a value that
 *          could be a pseudo-denormal, as SoftFloat doesn't handle
 *          pseudo-denormals correctly.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { \
        a_r80ValNormalized             = *(a_pr80Val); \
        a_r80ValNormalized.s.uExponent = 1; \
        (a_pr80Val)                    = &a_r80ValNormalized; \
    } else do {} while (0)
4961
4962#ifdef IEM_WITH_FLOAT128_FOR_FPU
4963
4964DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4965{
4966 int fNew;
4967 switch (fFcw & X86_FCW_RC_MASK)
4968 {
4969 default:
4970 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4971 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4972 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4973 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4974 }
4975 int fOld = fegetround();
4976 fesetround(fNew);
4977 return fOld;
4978}
4979
4980
4981DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4982{
4983 fesetround(fOld);
4984}
4985
4986DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4987{
4988 RT_NOREF(fFcw);
4989 RTFLOAT128U Tmp;
4990 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4991 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4992 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4993 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4994 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4995 {
4996 Assert(Tmp.s.uExponent == 0);
4997 Tmp.s2.uSignAndExponent++;
4998 }
4999 return *(_Float128 *)&Tmp;
5000}
5001
5002
5003DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5004{
5005 RT_NOREF(fFcw);
5006 RTFLOAT128U Tmp;
5007 *(_Float128 *)&Tmp = rd128ValSrc;
5008 ASMCompilerBarrier();
5009 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5010 {
5011 pr80Dst->s.fSign = Tmp.s64.fSign;
5012 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5013 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5014 | Tmp.s64.uFractionLo >> (64 - 15);
5015
5016 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5017 unsigned const cShiftOff = 64 - 15;
5018 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5019 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5020 if (uRoundedOff)
5021 {
5022 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5023 ? RT_BIT_64(cShiftOff - 1)
5024 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5025 ? fRoundingOffMask
5026 : 0;
5027 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5028 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5029 || uRoundedOff != uRoundingAdd)
5030 {
5031 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5032 {
5033 uFraction += 1;
5034 if (!(uFraction & RT_BIT_64(63)))
5035 { /* likely */ }
5036 else
5037 {
5038 uFraction >>= 1;
5039 pr80Dst->s.uExponent++;
5040 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5041 return fFsw;
5042 }
5043 fFsw |= X86_FSW_C1;
5044 }
5045 }
5046 fFsw |= X86_FSW_PE;
5047 if (!(fFcw & X86_FCW_PM))
5048 fFsw |= X86_FSW_ES | X86_FSW_B;
5049 }
5050 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5051 }
5052 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5053 {
5054 pr80Dst->s.fSign = Tmp.s64.fSign;
5055 pr80Dst->s.uExponent = 0;
5056 pr80Dst->s.uMantissa = 0;
5057 }
5058 else if (RTFLOAT128U_IS_INF(&Tmp))
5059 {
5060 pr80Dst->s.fSign = Tmp.s64.fSign;
5061 pr80Dst->s.uExponent = 0;
5062 pr80Dst->s.uMantissa = 0;
5063 }
5064 return fFsw;
5065}
5066
5067
5068#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5069
/**
 * Initializer for the SoftFloat state structure, derived from a FPU control word.
 *
 * The five values are, in order: the tininess detection mode, the rounding
 * mode selected by FCW.RC, zero, the FCW exception mask bits, and the rounding
 * precision selected by FCW.PC (64 for PC_53, 32 for PC_24, 80 otherwise).
 *
 * @param   a_fFcw  The FPU control word.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
          ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO ? (uint8_t)softfloat_round_minMag \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN ? (uint8_t)softfloat_round_min \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP   ? (uint8_t)softfloat_round_max \
        :                                                   (uint8_t)softfloat_round_near_even, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
          ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 \
        :                                                 (uint8_t)80 \
    }
5083
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The exception flags of the SoftFloat state are merged into the FSW, the
 * softfloat_flag_c1 flag is shifted up by two bit positions into it as well,
 * and ES + B are set if any of the raised exceptions is unmasked in the FCW.
 *
 * @param   a_fFsw          The FSW to start out with.
 * @param   a_pSoftState    The SoftFloat state (evaluated more than once).
 * @param   a_fFcw          The FPU control word.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (  (a_fFsw) \
     | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
     | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
     | (  ((a_pSoftState)->exceptionFlags & ~(a_fFcw) & X86_FSW_XCPT_MASK) \
        ? X86_FSW_ES | X86_FSW_B : 0) )
5091
5092
5093DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5094{
5095 RT_NOREF(fFcw);
5096 Assert(cBits > 64);
5097# if 0 /* rounding does not seem to help */
5098 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5099 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5100 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5101 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5102 {
5103 uint64_t uOld = r128.v[0];
5104 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5105 if (r128.v[0] < uOld)
5106 r128.v[1] += 1;
5107 }
5108# else
5109 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5110# endif
5111 return r128;
5112}
5113
5114
5115DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5116{
5117 RT_NOREF(fFcw);
5118 Assert(cBits > 64);
5119# if 0 /* rounding does not seem to help, not even on constants */
5120 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5121 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5122 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5123 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5124 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5125 {
5126 uint64_t uOld = r128.v[0];
5127 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5128 if (r128.v[0] < uOld)
5129 r128.v[1] += 1;
5130 }
5131 return r128;
5132# else
5133 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5134 return r128;
5135# endif
5136}
5137
5138
# if 0 /* unused */
/**
 * Converts from the IPRT 128-bit floating point format (RTFLOAT128U) to the
 * SoftFloat one (float128_t) by copying the two 64-bit words.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t const r128Ret = { { pr128->au64[0], pr128->au64[1] } };
    return r128Ret;
}
# endif
5146
5147
5148/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5149DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5150{
5151 extFloat80_t Tmp;
5152 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5153 Tmp.signif = pr80Val->s2.uMantissa;
5154 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5155 return extF80_to_f128(Tmp, &Ignored);
5156}
5157
5158
5159/**
5160 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5161 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5162 *
5163 * This is only a structure format conversion, nothing else.
5164 */
5165DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5166{
5167 extFloat80_t Tmp;
5168 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5169 Tmp.signif = pr80Val->s2.uMantissa;
5170 return Tmp;
5171}
5172
5173
5174/**
5175 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5176 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5177 *
5178 * This is only a structure format conversion, nothing else.
5179 */
5180DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5181{
5182 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5183 pr80Dst->s2.uMantissa = r80XSrc.signif;
5184 return pr80Dst;
5185}
5186
5187
5188DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5189{
5190 RT_NOREF(fFcw);
5191 RTFLOAT128U Tmp;
5192 *(float128_t *)&Tmp = r128Src;
5193 ASMCompilerBarrier();
5194
5195 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5196 {
5197 pr80Dst->s.fSign = Tmp.s64.fSign;
5198 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5199 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5200 | Tmp.s64.uFractionLo >> (64 - 15);
5201
5202 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5203 unsigned const cShiftOff = 64 - 15;
5204 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5205 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5206 if (uRoundedOff)
5207 {
5208 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5209 ? RT_BIT_64(cShiftOff - 1)
5210 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5211 ? fRoundingOffMask
5212 : 0;
5213 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5214 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5215 || uRoundedOff != uRoundingAdd)
5216 {
5217 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5218 {
5219 uFraction += 1;
5220 if (!(uFraction & RT_BIT_64(63)))
5221 { /* likely */ }
5222 else
5223 {
5224 uFraction >>= 1;
5225 pr80Dst->s.uExponent++;
5226 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5227 return fFsw;
5228 }
5229 fFsw |= X86_FSW_C1;
5230 }
5231 }
5232 fFsw |= X86_FSW_PE;
5233 if (!(fFcw & X86_FCW_PM))
5234 fFsw |= X86_FSW_ES | X86_FSW_B;
5235 }
5236
5237 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5238 }
5239 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5240 {
5241 pr80Dst->s.fSign = Tmp.s64.fSign;
5242 pr80Dst->s.uExponent = 0;
5243 pr80Dst->s.uMantissa = 0;
5244 }
5245 else if (RTFLOAT128U_IS_INF(&Tmp))
5246 {
5247 pr80Dst->s.fSign = Tmp.s64.fSign;
5248 pr80Dst->s.uExponent = 0x7fff;
5249 pr80Dst->s.uMantissa = 0;
5250 }
5251 return fFsw;
5252}
5253
5254
5255/**
5256 * Helper for transfering exception and C1 to FSW and setting the result value
5257 * accordingly.
5258 *
5259 * @returns Updated FSW.
5260 * @param pSoftState The SoftFloat state following the operation.
5261 * @param r80XResult The result of the SoftFloat operation.
5262 * @param pr80Result Where to store the result for IEM.
5263 * @param fFcw The FPU control word.
5264 * @param fFsw The FSW before the operation, with necessary bits
5265 * cleared and such.
5266 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5267 * raised.
5268 */
5269DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5270 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5271 PCRTFLOAT80U pr80XcptResult)
5272{
5273 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5274 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5275 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5276 fFsw |= X86_FSW_ES | X86_FSW_B;
5277
5278 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5279 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5280 else
5281 {
5282 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5283 *pr80Result = *pr80XcptResult;
5284 }
5285 return fFsw;
5286}
5287
5288
5289/**
5290 * Helper doing polynomial evaluation using Horner's method.
5291 *
5292 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5293 */
5294float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5295 unsigned cPrecision, softfloat_state_t *pSoftState)
5296{
5297 Assert(cHornerConsts > 1);
5298 size_t i = cHornerConsts - 1;
5299 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5300 while (i-- > 0)
5301 {
5302 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5303 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5304 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5305 }
5306 return r128Result;
5307}
5308
5309#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5310
5311
5312/**
5313 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5314 * mantissa, exponent and sign.
5315 *
5316 * @returns Updated FSW.
5317 * @param pr80Dst Where to return the composed value.
5318 * @param fSign The sign.
5319 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5320 * ignored and should be zero. This will probably be
5321 * modified during normalization and rounding.
5322 * @param iExponent Unbiased exponent.
5323 * @param fFcw The FPU control word.
5324 * @param fFsw The FPU status word.
5325 */
5326static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5327 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5328{
5329 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5330
5331 iExponent += RTFLOAT80U_EXP_BIAS;
5332
5333 /* Do normalization if necessary and possible. */
5334 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5335 {
5336 int cShift = 192 - RTUInt256BitCount(puMantissa);
5337 if (iExponent > cShift)
5338 iExponent -= cShift;
5339 else
5340 {
5341 if (fFcw & X86_FCW_UM)
5342 {
5343 if (iExponent > 0)
5344 cShift = --iExponent;
5345 else
5346 cShift = 0;
5347 }
5348 iExponent -= cShift;
5349 }
5350 RTUInt256AssignShiftLeft(puMantissa, cShift);
5351 }
5352
5353 /* Do rounding. */
5354 uint64_t uMantissa = puMantissa->QWords.qw2;
5355 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5356 {
5357 bool fAdd;
5358 switch (fFcw & X86_FCW_RC_MASK)
5359 {
5360 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5361 case X86_FCW_RC_NEAREST:
5362 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5363 {
5364 if ( (uMantissa & 1)
5365 || puMantissa->QWords.qw0 != 0
5366 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5367 {
5368 fAdd = true;
5369 break;
5370 }
5371 uMantissa &= ~(uint64_t)1;
5372 }
5373 fAdd = false;
5374 break;
5375 case X86_FCW_RC_ZERO:
5376 fAdd = false;
5377 break;
5378 case X86_FCW_RC_UP:
5379 fAdd = !fSign;
5380 break;
5381 case X86_FCW_RC_DOWN:
5382 fAdd = fSign;
5383 break;
5384 }
5385 if (fAdd)
5386 {
5387 uint64_t const uTmp = uMantissa;
5388 uMantissa = uTmp + 1;
5389 if (uMantissa < uTmp)
5390 {
5391 uMantissa >>= 1;
5392 uMantissa |= RT_BIT_64(63);
5393 iExponent++;
5394 }
5395 fFsw |= X86_FSW_C1;
5396 }
5397 fFsw |= X86_FSW_PE;
5398 if (!(fFcw & X86_FCW_PM))
5399 fFsw |= X86_FSW_ES | X86_FSW_B;
5400 }
5401
5402 /* Check for underflow (denormals). */
5403 if (iExponent <= 0)
5404 {
5405 if (fFcw & X86_FCW_UM)
5406 {
5407 if (uMantissa & RT_BIT_64(63))
5408 uMantissa >>= 1;
5409 iExponent = 0;
5410 }
5411 else
5412 {
5413 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5414 fFsw |= X86_FSW_ES | X86_FSW_B;
5415 }
5416 fFsw |= X86_FSW_UE;
5417 }
5418 /* Check for overflow */
5419 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5420 {
5421 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5422 }
5423
5424 /* Compose the result. */
5425 pr80Dst->s.uMantissa = uMantissa;
5426 pr80Dst->s.uExponent = iExponent;
5427 pr80Dst->s.fSign = fSign;
5428 return fFsw;
5429}
5430
5431
5432/**
5433 * See also iemAImpl_fld_r80_from_r32
5434 */
5435static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5436{
5437 uint16_t fFsw = 0;
5438 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5439 {
5440 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5441 pr80Dst->sj64.fInteger = 1;
5442 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5443 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5444 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5445 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5446 }
5447 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5448 {
5449 pr80Dst->s.fSign = pr32Val->s.fSign;
5450 pr80Dst->s.uExponent = 0;
5451 pr80Dst->s.uMantissa = 0;
5452 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5453 }
5454 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5455 {
5456 /* Subnormal -> normalized + X86_FSW_DE return. */
5457 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5458 pr80Dst->sj64.fInteger = 1;
5459 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5460 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5461 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5462 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5463 fFsw = X86_FSW_DE;
5464 }
5465 else if (RTFLOAT32U_IS_INF(pr32Val))
5466 {
5467 pr80Dst->s.fSign = pr32Val->s.fSign;
5468 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5469 pr80Dst->s.uMantissa = RT_BIT_64(63);
5470 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5471 }
5472 else
5473 {
5474 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5475 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5476 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5477 pr80Dst->sj64.fInteger = 1;
5478 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5479 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5480 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5481 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5482 }
5483 return fFsw;
5484}
5485
5486
5487/**
5488 * See also iemAImpl_fld_r80_from_r64
5489 */
5490static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5491{
5492 uint16_t fFsw = 0;
5493 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5494 {
5495 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5496 pr80Dst->sj64.fInteger = 1;
5497 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5498 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5499 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5500 }
5501 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5502 {
5503 pr80Dst->s.fSign = pr64Val->s.fSign;
5504 pr80Dst->s.uExponent = 0;
5505 pr80Dst->s.uMantissa = 0;
5506 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5507 }
5508 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5509 {
5510 /* Subnormal values gets normalized. */
5511 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5512 pr80Dst->sj64.fInteger = 1;
5513 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5514 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5515 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5516 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5517 fFsw = X86_FSW_DE;
5518 }
5519 else if (RTFLOAT64U_IS_INF(pr64Val))
5520 {
5521 pr80Dst->s.fSign = pr64Val->s.fSign;
5522 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5523 pr80Dst->s.uMantissa = RT_BIT_64(63);
5524 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5525 }
5526 else
5527 {
5528 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5529 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5530 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5531 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5532 pr80Dst->sj64.fInteger = 1;
5533 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5534 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5535 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5536 }
5537 return fFsw;
5538}
5539
5540
5541/**
5542 * See also EMIT_FILD.
5543 */
5544#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5545static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5546{ \
5547 if (iVal == 0) \
5548 { \
5549 pr80Dst->s.fSign = 0; \
5550 pr80Dst->s.uExponent = 0; \
5551 pr80Dst->s.uMantissa = 0; \
5552 } \
5553 else \
5554 { \
5555 if (iVal > 0) \
5556 pr80Dst->s.fSign = 0; \
5557 else \
5558 { \
5559 pr80Dst->s.fSign = 1; \
5560 iVal = -iVal; \
5561 } \
5562 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5563 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5564 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5565 } \
5566 return pr80Dst; \
5567}
5568EMIT_CONVERT_IXX_TO_R80(16)
5569EMIT_CONVERT_IXX_TO_R80(32)
5570//EMIT_CONVERT_IXX_TO_R80(64)
5571
/**
 * Emits an r80-by-r64 instruction helper (iemAImpl_fmul_r80_by_r64 and such)
 * on top of the corresponding r80-by-r80 implementation.
 *
 * The 64-bit operand is first widened to 80 bits.  If it was a subnormal
 * (X86_FSW_DE from the conversion), the denormal indication is dropped when
 * the first operand is a 387-invalid encoding or a NaN, or when
 * @a a_DenormalException evaluates to true.  Otherwise an unmasked denormal
 * exception ends the operation right away with the first operand as result;
 * a masked one is OR'ed into the FSW after the r80-by-r80 worker ran.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The r80-by-r80 implementation to call.
 * @param   a_DenormalException     Extra expression (may reference pr80Val1)
 *                                  that suppresses the denormal indication.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
            fFsw = 0; /* something else takes precedence over the denormal operand */ \
        else if ((pFpuState->FCW & X86_FCW_DM) == 0) \
        { \
            /* Unmasked denormal operand: flag it and leave the first operand as the result. */ \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    /* Replace the TOP value set by the worker with 7 and add any masked denormal indication. */ \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5594
/**
 * Emits an r80-by-r32 instruction helper (iemAImpl_fmul_r80_by_r32 and such)
 * on top of the corresponding r80-by-r80 implementation.
 *
 * The 32-bit operand is first widened to 80 bits.  If it was a subnormal
 * (X86_FSW_DE from the conversion), the denormal indication is dropped when
 * the first operand is a 387-invalid encoding or a NaN, or when
 * @a a_DenormalException evaluates to true.  Otherwise an unmasked denormal
 * exception ends the operation right away with the first operand as result;
 * a masked one is OR'ed into the FSW after the r80-by-r80 worker ran.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The r80-by-r80 implementation to call.
 * @param   a_DenormalException     Extra expression (may reference pr80Val1)
 *                                  that suppresses the denormal indication.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
            fFsw = 0; /* something else takes precedence over the denormal operand */ \
        else if ((pFpuState->FCW & X86_FCW_DM) == 0) \
        { \
            /* Unmasked denormal operand: flag it and leave the first operand as the result. */ \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    /* Replace the TOP value set by the worker with 7 and add any masked denormal indication. */ \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5617
/**
 * Emits an r80-by-i32 instruction helper (iemAImpl_fimul_r80_by_i32 and such):
 * the integer operand is converted to 80-bit floating point, the r80-by-r80
 * implementation is invoked and the TOP field of the resulting FSW is set to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The r80-by-r80 implementation to call.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U        r80Val2; \
    PRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5626
/**
 * Emits an r80-by-i16 instruction helper (iemAImpl_fimul_r80_by_i16 and such):
 * the integer operand is converted to 80-bit floating point, the r80-by-r80
 * implementation is invoked and the TOP field of the resulting FSW is set to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The r80-by-r80 implementation to call.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U        r80Val2; \
    PRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5635
5636
5637
5638/*********************************************************************************************************************************
5639* x87 FPU Division Operations *
5640*********************************************************************************************************************************/
5641
5642/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5643static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5644 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5645{
5646 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5647 {
5648 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5649 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5650 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5651 }
5652 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5653 { /* Div by zero. */
5654 if (fFcw & X86_FCW_ZM)
5655 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5656 else
5657 {
5658 *pr80Result = *pr80Val1Org;
5659 fFsw |= X86_FSW_ES | X86_FSW_B;
5660 }
5661 fFsw |= X86_FSW_ZE;
5662 }
5663 else
5664 { /* Invalid operand */
5665 if (fFcw & X86_FCW_IM)
5666 *pr80Result = g_r80Indefinite;
5667 else
5668 {
5669 *pr80Result = *pr80Val1Org;
5670 fFsw |= X86_FSW_ES | X86_FSW_B;
5671 }
5672 fFsw |= X86_FSW_IE;
5673 }
5674 return fFsw;
5675}
5676
5677
5678IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5679 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5680{
5681 uint16_t const fFcw = pFpuState->FCW;
5682 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5683
5684 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5685 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5686 {
5687 if (fFcw & X86_FCW_IM)
5688 pFpuRes->r80Result = g_r80Indefinite;
5689 else
5690 {
5691 pFpuRes->r80Result = *pr80Val1;
5692 fFsw |= X86_FSW_ES | X86_FSW_B;
5693 }
5694 fFsw |= X86_FSW_IE;
5695 }
5696 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5697 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5698 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5699 {
5700 if (fFcw & X86_FCW_DM)
5701 {
5702 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5703 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5704 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5705 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5706 }
5707 else
5708 {
5709 pFpuRes->r80Result = *pr80Val1;
5710 fFsw |= X86_FSW_ES | X86_FSW_B;
5711 }
5712 fFsw |= X86_FSW_DE;
5713 }
5714 /* SoftFloat can handle the rest: */
5715 else
5716 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5717
5718 pFpuRes->FSW = fFsw;
5719}
5720
5721
5722EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5723EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5724EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5725EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5726
5727
5728IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5729 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5730{
5731 uint16_t const fFcw = pFpuState->FCW;
5732 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5733
5734 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5735 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5736 {
5737 if (fFcw & X86_FCW_IM)
5738 pFpuRes->r80Result = g_r80Indefinite;
5739 else
5740 {
5741 pFpuRes->r80Result = *pr80Val1;
5742 fFsw |= X86_FSW_ES | X86_FSW_B;
5743 }
5744 fFsw |= X86_FSW_IE;
5745 }
5746 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5747 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5748 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5749 {
5750 if (fFcw & X86_FCW_DM)
5751 {
5752 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5753 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5754 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5755 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5756 }
5757 else
5758 {
5759 pFpuRes->r80Result = *pr80Val1;
5760 fFsw |= X86_FSW_ES | X86_FSW_B;
5761 }
5762 fFsw |= X86_FSW_DE;
5763 }
5764 /* SoftFloat can handle the rest: */
5765 else
5766 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5767
5768 pFpuRes->FSW = fFsw;
5769}
5770
5771
5772EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5773EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5774EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5775EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5776
5777
5778/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5779static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5780 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5781{
5782 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5783 {
5784 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5785 uint16_t fCxFlags = 0;
5786 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5787 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5788 &fCxFlags, &SoftState);
5789 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5790 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5791 if ( !(fFsw & X86_FSW_IE)
5792 && !RTFLOAT80U_IS_NAN(pr80Result)
5793 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5794 {
5795 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5796 fFsw |= fCxFlags & X86_FSW_C_MASK;
5797 }
5798 return fFsw;
5799 }
5800
5801 /* Invalid operand */
5802 if (fFcw & X86_FCW_IM)
5803 *pr80Result = g_r80Indefinite;
5804 else
5805 {
5806 *pr80Result = *pr80Val1Org;
5807 fFsw |= X86_FSW_ES | X86_FSW_B;
5808 }
5809 return fFsw | X86_FSW_IE;
5810}
5811
5812
5813static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5814 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5815{
5816 uint16_t const fFcw = pFpuState->FCW;
5817 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5818
5819 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5820 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5821 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5822 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5823 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5824 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5825 {
5826 if (fFcw & X86_FCW_IM)
5827 pFpuRes->r80Result = g_r80Indefinite;
5828 else
5829 {
5830 pFpuRes->r80Result = *pr80Val1;
5831 fFsw |= X86_FSW_ES | X86_FSW_B;
5832 }
5833 fFsw |= X86_FSW_IE;
5834 }
5835 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5836 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5837 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5838 {
5839 if (fFcw & X86_FCW_DM)
5840 {
5841 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5842 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5843 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5844 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5845 pr80Val1Org, fLegacyInstr);
5846 }
5847 else
5848 {
5849 pFpuRes->r80Result = *pr80Val1;
5850 fFsw |= X86_FSW_ES | X86_FSW_B;
5851 }
5852 fFsw |= X86_FSW_DE;
5853 }
5854 /* SoftFloat can handle the rest: */
5855 else
5856 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5857 pr80Val1, fLegacyInstr);
5858
5859 pFpuRes->FSW = fFsw;
5860}
5861
5862
5863IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5864 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5865{
5866 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5867}
5868
5869
5870IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5871 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5872{
5873 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5874}
5875
5876
5877/*********************************************************************************************************************************
5878* x87 FPU Multiplication Operations *
5879*********************************************************************************************************************************/
5880
5881/** Worker for iemAImpl_fmul_r80_by_r80. */
5882static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5883 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5884{
5885 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5886 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5887 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5888}
5889
5890
5891IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5892 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5893{
5894 uint16_t const fFcw = pFpuState->FCW;
5895 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5896
5897 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5898 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5899 {
5900 if (fFcw & X86_FCW_IM)
5901 pFpuRes->r80Result = g_r80Indefinite;
5902 else
5903 {
5904 pFpuRes->r80Result = *pr80Val1;
5905 fFsw |= X86_FSW_ES | X86_FSW_B;
5906 }
5907 fFsw |= X86_FSW_IE;
5908 }
5909 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5910 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5911 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5912 {
5913 if (fFcw & X86_FCW_DM)
5914 {
5915 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5916 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5917 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5918 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5919 }
5920 else
5921 {
5922 pFpuRes->r80Result = *pr80Val1;
5923 fFsw |= X86_FSW_ES | X86_FSW_B;
5924 }
5925 fFsw |= X86_FSW_DE;
5926 }
5927 /* SoftFloat can handle the rest: */
5928 else
5929 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5930
5931 pFpuRes->FSW = fFsw;
5932}
5933
5934
5935EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5936EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5937EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5938EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5939
5940
5941/*********************************************************************************************************************************
5942* x87 FPU Addition *
5943*********************************************************************************************************************************/
5944
5945/** Worker for iemAImpl_fadd_r80_by_r80. */
5946static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5947 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5948{
5949 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5950 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5951 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5952}
5953
5954
5955IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5956 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5957{
5958 uint16_t const fFcw = pFpuState->FCW;
5959 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5960
5961 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5962 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5963 {
5964 if (fFcw & X86_FCW_IM)
5965 pFpuRes->r80Result = g_r80Indefinite;
5966 else
5967 {
5968 pFpuRes->r80Result = *pr80Val1;
5969 fFsw |= X86_FSW_ES | X86_FSW_B;
5970 }
5971 fFsw |= X86_FSW_IE;
5972 }
5973 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5974 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5975 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5976 {
5977 if (fFcw & X86_FCW_DM)
5978 {
5979 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5980 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5981 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5982 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5983 }
5984 else
5985 {
5986 pFpuRes->r80Result = *pr80Val1;
5987 fFsw |= X86_FSW_ES | X86_FSW_B;
5988 }
5989 fFsw |= X86_FSW_DE;
5990 }
5991 /* SoftFloat can handle the rest: */
5992 else
5993 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5994
5995 pFpuRes->FSW = fFsw;
5996}
5997
5998
5999EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6000EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6001EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6002EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6003
6004
6005/*********************************************************************************************************************************
6006* x87 FPU Subtraction *
6007*********************************************************************************************************************************/
6008
6009/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6010static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6011 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6012{
6013 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6014 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6015 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6016}
6017
6018
6019IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6020 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6021{
6022 uint16_t const fFcw = pFpuState->FCW;
6023 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6024
6025 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6026 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6027 {
6028 if (fFcw & X86_FCW_IM)
6029 pFpuRes->r80Result = g_r80Indefinite;
6030 else
6031 {
6032 pFpuRes->r80Result = *pr80Val1;
6033 fFsw |= X86_FSW_ES | X86_FSW_B;
6034 }
6035 fFsw |= X86_FSW_IE;
6036 }
6037 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6038 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6039 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6040 {
6041 if (fFcw & X86_FCW_DM)
6042 {
6043 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6044 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6045 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6046 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6047 }
6048 else
6049 {
6050 pFpuRes->r80Result = *pr80Val1;
6051 fFsw |= X86_FSW_ES | X86_FSW_B;
6052 }
6053 fFsw |= X86_FSW_DE;
6054 }
6055 /* SoftFloat can handle the rest: */
6056 else
6057 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6058
6059 pFpuRes->FSW = fFsw;
6060}
6061
6062
6063EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6064EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6065EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6066EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6067
6068
6069/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6070IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6071 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6072{
6073 uint16_t const fFcw = pFpuState->FCW;
6074 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6075
6076 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6077 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6078 {
6079 if (fFcw & X86_FCW_IM)
6080 pFpuRes->r80Result = g_r80Indefinite;
6081 else
6082 {
6083 pFpuRes->r80Result = *pr80Val1;
6084 fFsw |= X86_FSW_ES | X86_FSW_B;
6085 }
6086 fFsw |= X86_FSW_IE;
6087 }
6088 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6089 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6090 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6091 {
6092 if (fFcw & X86_FCW_DM)
6093 {
6094 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6095 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6096 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6097 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6098 }
6099 else
6100 {
6101 pFpuRes->r80Result = *pr80Val1;
6102 fFsw |= X86_FSW_ES | X86_FSW_B;
6103 }
6104 fFsw |= X86_FSW_DE;
6105 }
6106 /* SoftFloat can handle the rest: */
6107 else
6108 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6109
6110 pFpuRes->FSW = fFsw;
6111}
6112
6113
/* Instantiate the FSUBR flavours taking a 64-bit real, 32-bit real, 32-bit
   integer and 16-bit integer second operand, all built on
   iemAImpl_fsubr_r80_by_r80.  The EMIT_R80_BY_* macro bodies are defined
   elsewhere in this file.
   NOTE(review): the meaning of the trailing 0 argument is not visible here. */
EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6118
6119
6120/*********************************************************************************************************************************
* x87 FPU Trigonometric Operations *
6122*********************************************************************************************************************************/
6123static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6124{
6125 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6126 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6127 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6128 extFloat80_t v;
6129 (void)fFcw;
6130
6131 v = extF80_atan2(y, x, &SoftState);
6132
6133 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6134 return fFsw;
6135}
6136
6137IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6138 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6139{
6140 uint16_t const fFcw = pFpuState->FCW;
6141 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6142
6143 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6144 {
6145 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6146
6147 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6148 if (!(fFcw & X86_FCW_PM))
6149 fFsw |= X86_FSW_ES | X86_FSW_B;
6150 }
6151 else
6152 {
6153 fFsw |= X86_FSW_IE;
6154 if (!(fFcw & X86_FCW_IM))
6155 {
6156 pFpuRes->r80Result = *pr80Val2;
6157 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6158 }
6159 else
6160 {
6161 pFpuRes->r80Result = g_r80Indefinite;
6162 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6163 }
6164 }
6165
6166 pFpuRes->FSW = fFsw;
6167}
6168#endif /* IEM_WITHOUT_ASSEMBLY */
6169
/**
 * Intel-named FPATAN entry point.
 *
 * Forwards all arguments unchanged to iemAImpl_fpatan_r80_by_r80; no
 * vendor-specific behaviour is implemented here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                          PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6175
/**
 * AMD-named FPATAN entry point.
 *
 * Forwards all arguments unchanged to iemAImpl_fpatan_r80_by_r80; no
 * vendor-specific behaviour is implemented here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
}
6181
6182
6183#if defined(IEM_WITHOUT_ASSEMBLY)
6184static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6185{
6186 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6187 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6188 extFloat80_t v;
6189 (void)fFcw;
6190
6191 v = extF80_tan(x, &SoftState);
6192
6193 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6194 return fFsw;
6195}
6196
6197IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6198{
6199 uint16_t const fFcw = pFpuState->FCW;
6200 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6201
6202 if (RTFLOAT80U_IS_ZERO(pr80Val))
6203 {
6204 pFpuResTwo->r80Result1 = *pr80Val;
6205 pFpuResTwo->r80Result2 = g_ar80One[0];
6206 }
6207 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6208 {
6209 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6210 {
6211 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6212 pFpuResTwo->r80Result1 = *pr80Val;
6213 }
6214 else
6215 {
6216 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6217 {
6218 pFpuResTwo->r80Result1 = *pr80Val;
6219 }
6220 else
6221 {
6222 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6223 }
6224
6225 pFpuResTwo->r80Result2 = g_ar80One[0];
6226
6227 fFsw |= X86_FSW_PE;
6228 if (!(fFcw & X86_FCW_PM))
6229 fFsw |= X86_FSW_ES | X86_FSW_B;
6230 }
6231 }
6232 else
6233 {
6234 fFsw |= X86_FSW_IE;
6235 if (!(fFcw & X86_FCW_IM))
6236 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6237 }
6238
6239 pFpuResTwo->FSW = fFsw;
6240}
6241#endif /* IEM_WITHOUT_ASSEMBLY */
6242
/**
 * AMD-named FPTAN entry point.
 *
 * Forwards all arguments unchanged to iemAImpl_fptan_r80_r80; no
 * vendor-specific behaviour is implemented here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6247
/**
 * Intel-named FPTAN entry point.
 *
 * Forwards all arguments unchanged to iemAImpl_fptan_r80_r80; no
 * vendor-specific behaviour is implemented here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6252
6253#ifdef IEM_WITHOUT_ASSEMBLY
6254
6255static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6256{
6257 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6258 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6259 extFloat80_t v;
6260 (void)fFcw;
6261
6262 v = extF80_sin(x, &SoftState);
6263
6264 iemFpuSoftF80ToIprt(pr80Result, v);
6265
6266 return fFsw;
6267}
6268
6269IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6270{
6271 uint16_t const fFcw = pFpuState->FCW;
6272 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6273
6274 if (RTFLOAT80U_IS_ZERO(pr80Val))
6275 {
6276 pFpuRes->r80Result = *pr80Val;
6277 }
6278 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6279 {
6280 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6281 {
6282 fFsw |= X86_FSW_C2;
6283 pFpuRes->r80Result = *pr80Val;
6284 }
6285 else
6286 {
6287 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6288 {
6289 pFpuRes->r80Result = *pr80Val;
6290
6291 }
6292 else
6293 {
6294 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6295 }
6296 fFsw |= X86_FSW_PE;
6297 if (!(fFcw & X86_FCW_PM))
6298 fFsw |= X86_FSW_ES | X86_FSW_B;
6299 }
6300 }
6301 else if (RTFLOAT80U_IS_INF(pr80Val))
6302 {
6303 fFsw |= X86_FSW_IE;
6304 if (!(fFcw & X86_FCW_IM))
6305 {
6306 fFsw |= X86_FSW_ES | X86_FSW_B;
6307 pFpuRes->r80Result = *pr80Val;
6308 }
6309 else
6310 {
6311 pFpuRes->r80Result = g_r80Indefinite;
6312 }
6313 }
6314 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6315 {
6316 pFpuRes->r80Result = *pr80Val;
6317 fFsw |= X86_FSW_DE;
6318
6319 if (fFcw & X86_FCW_DM)
6320 {
6321 fFsw |= X86_FSW_UE | X86_FSW_PE;
6322
6323 if (!(fFcw & X86_FCW_UM) || !(fFcw & X86_FCW_PM))
6324 {
6325 fFsw |= X86_FSW_ES | X86_FSW_B;
6326 }
6327 }
6328 else
6329 {
6330 fFsw |= X86_FSW_ES | X86_FSW_B;
6331 }
6332 }
6333 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6334 {
6335 pFpuRes->r80Result = *pr80Val;
6336 fFsw |= X86_FSW_DE;
6337
6338 if (fFcw & X86_FCW_DM)
6339 {
6340 if (fFcw & X86_FCW_PM)
6341 {
6342 fFsw |= X86_FSW_PE;
6343 }
6344 else
6345 {
6346 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6347 }
6348
6349 pFpuRes->r80Result.sj64.uExponent = 1;
6350 }
6351 else
6352 {
6353 fFsw |= X86_FSW_ES | X86_FSW_B;
6354 }
6355 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6356 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6357 {
6358 pFpuRes->r80Result = *pr80Val;
6359 } else {
6360 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6361 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6362 && (fFcw & X86_FCW_IM))
6363 pFpuRes->r80Result = g_r80Indefinite;
6364 else
6365 {
6366 pFpuRes->r80Result = *pr80Val;
6367 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6368 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6369 }
6370
6371 fFsw |= X86_FSW_IE;
6372 if (!(fFcw & X86_FCW_IM))
6373 fFsw |= X86_FSW_ES | X86_FSW_B;
6374 }
6375
6376 pFpuRes->FSW = fFsw;
6377}
6378#endif /* IEM_WITHOUT_ASSEMBLY */
6379
/**
 * AMD-named FSIN entry point.
 *
 * Forwards all arguments unchanged to iemAImpl_fsin_r80; no vendor-specific
 * behaviour is implemented here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6384
/**
 * Intel-named FSIN entry point.
 *
 * Forwards all arguments unchanged to iemAImpl_fsin_r80; no vendor-specific
 * behaviour is implemented here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
}
6389
6390#ifdef IEM_WITHOUT_ASSEMBLY
6391
6392static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6393{
6394 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6395 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6396 extFloat80_t v;
6397 (void)fFcw;
6398
6399 v = extF80_cos(x, &SoftState);
6400
6401 iemFpuSoftF80ToIprt(pr80Result, v);
6402
6403 return fFsw;
6404}
6405
6406IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6407{
6408 uint16_t const fFcw = pFpuState->FCW;
6409 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6410
6411 if (RTFLOAT80U_IS_ZERO(pr80Val))
6412 {
6413 pFpuRes->r80Result = g_ar80One[0];
6414 }
6415 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6416 {
6417 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6418 {
6419 fFsw |= X86_FSW_C2;
6420 pFpuRes->r80Result = *pr80Val;
6421 }
6422 else
6423 {
6424 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6425 {
6426 pFpuRes->r80Result = g_ar80One[0];
6427
6428 }
6429 else
6430 {
6431 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6432 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6433 }
6434 fFsw |= X86_FSW_PE;
6435 if (!(fFcw & X86_FCW_PM))
6436 fFsw |= X86_FSW_ES | X86_FSW_B;
6437 }
6438 }
6439 else if (RTFLOAT80U_IS_INF(pr80Val))
6440 {
6441 fFsw |= X86_FSW_IE;
6442 if (!(fFcw & X86_FCW_IM))
6443 {
6444 fFsw |= X86_FSW_ES | X86_FSW_B;
6445 pFpuRes->r80Result = *pr80Val;
6446 }
6447 else
6448 {
6449 pFpuRes->r80Result = g_r80Indefinite;
6450 }
6451 }
6452 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6453 {
6454 fFsw |= X86_FSW_DE;
6455
6456 if (fFcw & X86_FCW_DM)
6457 {
6458 pFpuRes->r80Result = g_ar80One[0];
6459
6460 if (fFcw & X86_FCW_PM)
6461 {
6462 fFsw |= X86_FSW_PE;
6463 }
6464 else
6465 {
6466 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6467 }
6468 }
6469 else
6470 {
6471 pFpuRes->r80Result = *pr80Val;
6472 fFsw |= X86_FSW_ES | X86_FSW_B;
6473 }
6474 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6475 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6476 {
6477 pFpuRes->r80Result = *pr80Val;
6478 } else {
6479 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6480 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6481 && (fFcw & X86_FCW_IM))
6482 pFpuRes->r80Result = g_r80Indefinite;
6483 else
6484 {
6485 pFpuRes->r80Result = *pr80Val;
6486 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6487 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6488 }
6489
6490 fFsw |= X86_FSW_IE;
6491 if (!(fFcw & X86_FCW_IM))
6492 fFsw |= X86_FSW_ES | X86_FSW_B;
6493 }
6494
6495 pFpuRes->FSW = fFsw;
6496}
6497#endif /* IEM_WITHOUT_ASSEMBLY */
6498
/**
 * AMD-named FCOS entry point.
 *
 * Forwards all arguments unchanged to iemAImpl_fcos_r80; no vendor-specific
 * behaviour is implemented here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6503
/**
 * Intel-named FCOS entry point.
 *
 * Forwards all arguments unchanged to iemAImpl_fcos_r80; no vendor-specific
 * behaviour is implemented here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
}
6508
6509#ifdef IEM_WITHOUT_ASSEMBLY
6510
6511static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6512{
6513 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6514 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6515 extFloat80_t r80Sin, r80Cos;
6516 (void)fFcw;
6517
6518 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6519
6520 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6521 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6522
6523 return fFsw;
6524}
6525
6526IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6527{
6528 uint16_t const fFcw = pFpuState->FCW;
6529 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6530
6531 if (RTFLOAT80U_IS_ZERO(pr80Val))
6532 {
6533 pFpuResTwo->r80Result1 = *pr80Val;
6534 pFpuResTwo->r80Result2 = g_ar80One[0];
6535 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6536 }
6537 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6538 {
6539 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6540 {
6541 fFsw |= X86_FSW_C2;
6542
6543 if (fFcw & X86_FCW_IM)
6544 {
6545 pFpuResTwo->r80Result1 = g_r80Indefinite;
6546 }
6547 else
6548 {
6549 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6550 }
6551
6552 pFpuResTwo->r80Result2 = *pr80Val;
6553 }
6554 else
6555 {
6556 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6557
6558 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6559 {
6560 pFpuResTwo->r80Result1 = *pr80Val;
6561 pFpuResTwo->r80Result2 = g_ar80One[0];
6562 }
6563 else
6564 {
6565 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6566 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6567 }
6568 fFsw |= X86_FSW_PE;
6569 if (!(fFcw & X86_FCW_PM))
6570 fFsw |= X86_FSW_ES | X86_FSW_B;
6571 }
6572 }
6573 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6574 {
6575 fFsw |= X86_FSW_DE;
6576
6577 if (fFcw & X86_FCW_DM)
6578 {
6579 pFpuResTwo->r80Result1 = *pr80Val;
6580 pFpuResTwo->r80Result2 = g_ar80One[0];
6581 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6582
6583 if (fFcw & X86_FCW_PM)
6584 {
6585 fFsw |= X86_FSW_PE;
6586 }
6587 else
6588 {
6589 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6590 }
6591
6592 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6593 }
6594 else
6595 {
6596 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6597 pFpuResTwo->r80Result2 = *pr80Val;
6598 fFsw |= X86_FSW_ES | X86_FSW_B;
6599 }
6600 }
6601 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6602 {
6603 fFsw |= X86_FSW_DE;
6604
6605 if (fFcw & X86_FCW_DM)
6606 {
6607 pFpuResTwo->r80Result2 = g_ar80One[0];
6608
6609 if (fFcw & X86_FCW_UM)
6610 {
6611 pFpuResTwo->r80Result1 = *pr80Val;
6612 }
6613 else
6614 {
6615 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6616 uint64_t uMantissa = pr80Val->s.uMantissa;
6617 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6618
6619 uExponent = 64 - uExponent;
6620 uMantissa <<= uExponent;
6621 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6622
6623 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6624 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6625 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6626 }
6627
6628 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6629 fFsw |= X86_FSW_UE | X86_FSW_PE;
6630
6631 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6632 {
6633 /* All the exceptions are masked. */
6634 }
6635 else
6636 {
6637 fFsw |= X86_FSW_ES | X86_FSW_B;
6638 }
6639 }
6640 else
6641 {
6642 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6643 pFpuResTwo->r80Result2 = *pr80Val;
6644 fFsw |= X86_FSW_ES | X86_FSW_B;
6645 }
6646 }
6647 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6648 {
6649 pFpuResTwo->r80Result1 = *pr80Val;
6650 pFpuResTwo->r80Result2 = *pr80Val;
6651 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6652 }
6653 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6654 {
6655 if (fFcw & X86_FCW_IM)
6656 {
6657 pFpuResTwo->r80Result1 = g_r80Indefinite;
6658 pFpuResTwo->r80Result2 = g_r80Indefinite;
6659 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6660 }
6661 else
6662 {
6663 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6664 pFpuResTwo->r80Result2 = *pr80Val;
6665 }
6666
6667 fFsw |= X86_FSW_IE;
6668 if (!(fFcw & X86_FCW_IM))
6669 fFsw |= X86_FSW_ES | X86_FSW_B;
6670 }
6671 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6672 {
6673 pFpuResTwo->r80Result1 = *pr80Val;
6674 pFpuResTwo->r80Result2 = *pr80Val;
6675
6676 if (fFcw & X86_FCW_IM)
6677 {
6678 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6679 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6680 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6681 }
6682 else
6683 {
6684 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6685 pFpuResTwo->r80Result2 = *pr80Val;
6686 }
6687
6688 fFsw |= X86_FSW_IE;
6689 if (!(fFcw & X86_FCW_IM))
6690 fFsw |= X86_FSW_ES | X86_FSW_B;
6691 }
6692 else if (RTFLOAT80U_IS_INF(pr80Val))
6693 {
6694 if (fFcw & X86_FCW_IM)
6695 {
6696 pFpuResTwo->r80Result1 = g_r80Indefinite;
6697 pFpuResTwo->r80Result2 = g_r80Indefinite;
6698 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6699 }
6700 else
6701 {
6702 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6703 pFpuResTwo->r80Result2 = *pr80Val;
6704 }
6705
6706 fFsw |= X86_FSW_IE;
6707 if (!(fFcw & X86_FCW_IM))
6708 fFsw |= X86_FSW_ES | X86_FSW_B;
6709 }
6710
6711 pFpuResTwo->FSW = fFsw;
6712}
6713#endif /* IEM_WITHOUT_ASSEMBLY */
6714
/**
 * AMD-named FSINCOS entry point.
 *
 * Forwards all arguments unchanged to iemAImpl_fsincos_r80_r80; no
 * vendor-specific behaviour is implemented here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6719
/**
 * Intel-named FSINCOS entry point.
 *
 * Forwards all arguments unchanged to iemAImpl_fsincos_r80_r80; no
 * vendor-specific behaviour is implemented here.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
{
    iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
}
6724
6725#ifdef IEM_WITHOUT_ASSEMBLY
6726
6727
6728/*********************************************************************************************************************************
6729* x87 FPU Compare and Testing Operations *
6730*********************************************************************************************************************************/
6731
6732IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6733{
6734 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6735
6736 if (RTFLOAT80U_IS_ZERO(pr80Val))
6737 fFsw |= X86_FSW_C3;
6738 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6739 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6740 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6741 {
6742 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6743 if (!(pFpuState->FCW & X86_FCW_DM))
6744 fFsw |= X86_FSW_ES | X86_FSW_B;
6745 }
6746 else
6747 {
6748 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6749 if (!(pFpuState->FCW & X86_FCW_IM))
6750 fFsw |= X86_FSW_ES | X86_FSW_B;
6751 }
6752
6753 *pu16Fsw = fFsw;
6754}
6755
6756
6757IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6758{
6759 RT_NOREF(pFpuState);
6760 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6761
6762 /* C1 = sign bit (always, even if empty Intel says). */
6763 if (pr80Val->s.fSign)
6764 fFsw |= X86_FSW_C1;
6765
6766 /* Classify the value in C0, C2, C3. */
6767 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6768 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6769 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6770 fFsw |= X86_FSW_C2;
6771 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6772 fFsw |= X86_FSW_C3;
6773 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6774 fFsw |= X86_FSW_C0;
6775 else if (RTFLOAT80U_IS_INF(pr80Val))
6776 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6777 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6778 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6779 /* whatever else: 0 */
6780
6781 *pu16Fsw = fFsw;
6782}
6783
6784
6785/**
6786 * Worker for fcom, fucom, and friends.
6787 */
6788static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6789 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6790{
6791 /*
6792 * Unpack the values.
6793 */
6794 bool const fSign1 = pr80Val1->s.fSign;
6795 int32_t iExponent1 = pr80Val1->s.uExponent;
6796 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6797
6798 bool const fSign2 = pr80Val2->s.fSign;
6799 int32_t iExponent2 = pr80Val2->s.uExponent;
6800 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6801
6802 /*
6803 * Check for invalid inputs.
6804 */
6805 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6806 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6807 {
6808 if (!(fFcw & X86_FCW_IM))
6809 fFsw |= X86_FSW_ES | X86_FSW_B;
6810 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6811 }
6812
6813 /*
6814 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6815 */
6816 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6817 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6818 {
6819 if ( fIeOnAllNaNs
6820 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6821 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6822 {
6823 fFsw |= X86_FSW_IE;
6824 if (!(fFcw & X86_FCW_IM))
6825 fFsw |= X86_FSW_ES | X86_FSW_B;
6826 }
6827 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6828 }
6829
6830 /*
6831 * Normalize the values.
6832 */
6833 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6834 {
6835 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6836 iExponent1 = 1;
6837 else
6838 {
6839 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6840 uMantissa1 <<= iExponent1;
6841 iExponent1 = 1 - iExponent1;
6842 }
6843 fFsw |= X86_FSW_DE;
6844 if (!(fFcw & X86_FCW_DM))
6845 fFsw |= X86_FSW_ES | X86_FSW_B;
6846 }
6847
6848 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6849 {
6850 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6851 iExponent2 = 1;
6852 else
6853 {
6854 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6855 uMantissa2 <<= iExponent2;
6856 iExponent2 = 1 - iExponent2;
6857 }
6858 fFsw |= X86_FSW_DE;
6859 if (!(fFcw & X86_FCW_DM))
6860 fFsw |= X86_FSW_ES | X86_FSW_B;
6861 }
6862
6863 /*
6864 * Test if equal (val1 == val2):
6865 */
6866 if ( uMantissa1 == uMantissa2
6867 && iExponent1 == iExponent2
6868 && ( fSign1 == fSign2
6869 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6870 fFsw |= X86_FSW_C3;
6871 /*
6872 * Test if less than (val1 < val2):
6873 */
6874 else if (fSign1 && !fSign2)
6875 fFsw |= X86_FSW_C0;
6876 else if (fSign1 == fSign2)
6877 {
6878 /* Zeros are problematic, however at the most one can be zero here. */
6879 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6880 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6881 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6882 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6883
6884 if ( fSign1
6885 ^ ( iExponent1 < iExponent2
6886 || ( iExponent1 == iExponent2
6887 && uMantissa1 < uMantissa2 ) ) )
6888 fFsw |= X86_FSW_C0;
6889 }
6890 /* else: No flags set if greater. */
6891
6892 return fFsw;
6893}
6894
6895
/**
 * FCOM with two 80-bit operands.
 *
 * Calls the comparison worker with an initial status word of TOP=6 and with
 * \#IE raised for every kind of NaN operand.
 *
 * @param   pFpuState   FPU state; only FCW is consulted.
 * @param   pfFsw       Where to return the status word.
 * @param   pr80Val1    The first value.
 * @param   pr80Val2    The second value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                  PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
}
6901
6902
6903
6904
/**
 * FUCOM with two 80-bit operands.
 *
 * Same as iemAImpl_fcom_r80_by_r80, except that fIeOnAllNaNs is false, so the
 * worker raises \#IE for signalling NaNs only.
 *
 * @param   pFpuState   FPU state; only FCW is consulted.
 * @param   pfFsw       Where to return the status word.
 * @param   pr80Val1    The first value.
 * @param   pr80Val2    The second value.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                   PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
}
6910
6911
6912IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6913 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6914{
6915 RTFLOAT80U r80Val2;
6916 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6917 Assert(!fFsw || fFsw == X86_FSW_DE);
6918 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6919 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6920 {
6921 if (!(pFpuState->FCW & X86_FCW_DM))
6922 fFsw |= X86_FSW_ES | X86_FSW_B;
6923 *pfFsw |= fFsw;
6924 }
6925}
6926
6927
6928IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6929 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6930{
6931 RTFLOAT80U r80Val2;
6932 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6933 Assert(!fFsw || fFsw == X86_FSW_DE);
6934 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6935 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6936 {
6937 if (!(pFpuState->FCW & X86_FCW_DM))
6938 fFsw |= X86_FSW_ES | X86_FSW_B;
6939 *pfFsw |= fFsw;
6940 }
6941}
6942
6943
6944IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6945 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6946{
6947 RTFLOAT80U r80Val2;
6948 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6949 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6950}
6951
6952
6953IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6954 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6955{
6956 RTFLOAT80U r80Val2;
6957 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6958 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6959}
6960
6961
6962/**
6963 * Worker for fcomi & fucomi.
6964 */
6965static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6966 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6967{
6968 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6969 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6970 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6971 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6972
6973 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6974 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6975 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6976}
6977
6978
/**
 * FCOMI: compares two 80-bit values and returns the outcome as EFLAGS.
 *
 * Forwards to the fcomi worker with fIeOnAllNaNs set to true.
 *
 * @returns The EFLAGS value produced by the worker.
 * @param   pFpuState   FPU state supplying FCW and the current FSW.
 * @param   pfFsw       Where to return the new status word.
 * @param   pr80Val1    The first value.
 * @param   pr80Val2    The second value.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                       PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
}
6984
6985
/**
 * FUCOMI: compares two 80-bit values and returns the outcome as EFLAGS.
 *
 * Forwards to the fcomi worker with fIeOnAllNaNs set to false.
 *
 * @returns The EFLAGS value produced by the worker.
 * @param   pFpuState   FPU state supplying FCW and the current FSW.
 * @param   pfFsw       Where to return the new status word.
 * @param   pr80Val1    The first value.
 * @param   pr80Val2    The second value.
 */
IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
                                                        PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
{
    return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
}
6991
6992
6993/*********************************************************************************************************************************
6994* x87 FPU Other Operations *
6995*********************************************************************************************************************************/
6996
6997/**
6998 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
6999 */
7000static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7001{
7002 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7003 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7004 true /*exact / generate #PE */, &SoftState));
7005 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7006}
7007
7008
7009IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7010{
7011 uint16_t const fFcw = pFpuState->FCW;
7012 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7013
7014 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7015 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7016 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7017 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7018 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7019 || RTFLOAT80U_IS_INF(pr80Val))
7020 pFpuRes->r80Result = *pr80Val;
7021 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7022 {
7023 fFsw |= X86_FSW_DE;
7024 if (fFcw & X86_FCW_DM)
7025 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7026 else
7027 {
7028 pFpuRes->r80Result = *pr80Val;
7029 fFsw |= X86_FSW_ES | X86_FSW_B;
7030 }
7031 }
7032 else
7033 {
7034 if (fFcw & X86_FCW_IM)
7035 {
7036 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7037 pFpuRes->r80Result = g_r80Indefinite;
7038 else
7039 {
7040 pFpuRes->r80Result = *pr80Val;
7041 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7042 }
7043 }
7044 else
7045 {
7046 pFpuRes->r80Result = *pr80Val;
7047 fFsw |= X86_FSW_ES | X86_FSW_B;
7048 }
7049 fFsw |= X86_FSW_IE;
7050 }
7051 pFpuRes->FSW = fFsw;
7052}
7053
7054
7055IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7056 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7057{
7058 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7059 it does everything we need it to do. */
7060 uint16_t const fFcw = pFpuState->FCW;
7061 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7062 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7063 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7064 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7065}
7066
7067
7068/**
7069 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7070 */
7071static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7072{
7073 Assert(!pr80Val->s.fSign);
7074 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7075 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7076 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7077}
7078
7079
7080IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7081{
7082 uint16_t const fFcw = pFpuState->FCW;
7083 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7084
7085 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7086 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7087 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7088 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7089 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7090 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7091 pFpuRes->r80Result = *pr80Val;
7092 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7093 {
7094 fFsw |= X86_FSW_DE;
7095 if (fFcw & X86_FCW_DM)
7096 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7097 else
7098 {
7099 pFpuRes->r80Result = *pr80Val;
7100 fFsw |= X86_FSW_ES | X86_FSW_B;
7101 }
7102 }
7103 else
7104 {
7105 if (fFcw & X86_FCW_IM)
7106 {
7107 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7108 pFpuRes->r80Result = g_r80Indefinite;
7109 else
7110 {
7111 pFpuRes->r80Result = *pr80Val;
7112 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7113 }
7114 }
7115 else
7116 {
7117 pFpuRes->r80Result = *pr80Val;
7118 fFsw |= X86_FSW_ES | X86_FSW_B;
7119 }
7120 fFsw |= X86_FSW_IE;
7121 }
7122 pFpuRes->FSW = fFsw;
7123}
7124
7125
7126/**
 * @code{.unparsed}
 *            x          x * ln2
 *    f(x) = 2  - 1  =  e        - 1
 *
 * @endcode
7132 *
7133 * We can approximate e^x by a Taylor/Maclaurin series (see
7134 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
 * @code{.unparsed}
 *            n      0     1     2     3     4
 *    inf    x      x     x     x     x     x
 *    SUM  ----- = --- + --- + --- + --- + --- + ...
 *    n=0    n!     0!    1!    2!    3!    4!
 *
 *                           2     3     4
 *                          x     x     x
 *               = 1 + x + --- + --- + --- + ...
 *                          2!    3!    4!
 * @endcode
7146 *
7147 * Given z = x * ln2, we get:
 * @code{.unparsed}
 *                   2     3     4           n
 *     z            z     z     z           z
 *    e  - 1 = z + --- + --- + --- + ... + ---
 *                  2!    3!    4!          n!
 * @endcode
7154 *
7155 * Wanting to use Horner's method, we move one z outside and get:
 * @code{.unparsed}
 *                      2     3            (n-1)
 *               z     z     z            z
 *    = z ( 1 + --- + --- + --- + ... + ------- )
 *               2!    3!    4!           n!
 * @endcode
7162 *
7163 * The constants we need for using Horner's methods are 1 and 1 / n!.
7164 *
 * For very tiny x values, we can get away with f(x) = x * ln 2, because
 * we don't have the necessary precision to represent 1.0 + z/2! + ...
 * and can approximate it to be 1.0.  For a visual demonstration of this
 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
 * as it is valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7170 *
7171 *
 * As far as constant accuracy goes, figure 0.1 "80387 Block Diagram" in the
 * "80387 Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001;
 * Military i387SX 271166-002), indicates that constants are 67-bit (constant
 * rom block) and the internal mantissa size is 68-bit (mantissa adder & barrel
 * shifter blocks).  (The one bit difference is probably an implicit one
 * missing from the constant ROM.)  A paper on division and sqrt on the AMD-K7
 * by Stuart F. Oberman states that it internally used a 68-bit mantissa with
 * an 18-bit exponent.
7180 *
 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have
 * not yet successfully reproduced the exact results from an Intel 10980XE;
 * there is always a portion of rounding differences.  Not going to spend too
 * much time on getting this 100% the same, at least not now.
7185 *
 * P.S. If anyone is really curious about the 8087 and its constants:
 *      http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7188 *
7189 *
7190 * @param pr80Val The exponent value (x), less than 1.0, greater than
7191 * -1.0 and not zero. This can be a normal, denormal
7192 * or pseudo-denormal value.
7193 * @param pr80Result Where to return the result.
7194 * @param fFcw FPU control word.
7195 * @param fFsw FPU status word.
7196 */
7197static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7198{
7199 /* As mentioned above, we can skip the expensive polynomial calculation
7200 as it will be close enough to 1.0 that it makes no difference.
7201
7202 The cutoff point for intel 10980XE is exponents >= -69. Intel
7203 also seems to be using a 67-bit or 68-bit constant value, and we get
7204 a smattering of rounding differences if we go for higher precision. */
7205 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7206 {
7207 RTUINT256U u256;
7208 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7209 u256.QWords.qw0 |= 1; /* force #PE */
7210 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7211 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7212 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7213 : 1 - RTFLOAT80U_EXP_BIAS,
7214 fFcw, fFsw);
7215 }
7216 else
7217 {
7218#ifdef IEM_WITH_FLOAT128_FOR_FPU
7219 /* This approach is not good enough for small values, we end up with zero. */
7220 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7221 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7222 _Float128 rd128Result = powf128(2.0L, rd128Val);
7223 rd128Result -= 1.0L;
7224 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7225 iemFpuF128RestoreRounding(fOldRounding);
7226
7227# else
7228 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7229 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7230
7231 /* As mentioned above, enforce 68-bit internal mantissa width to better
7232 match the Intel 10980XE results. */
7233 unsigned const cPrecision = 68;
7234
7235 /* first calculate z = x * ln2 */
7236 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7237 cPrecision);
7238
7239 /* Then do the polynomial evaluation. */
7240 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7241 cPrecision, &SoftState);
7242 r = f128_mul(z, r, &SoftState);
7243
7244 /* Output the result. */
7245 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7246# endif
7247 }
7248 return fFsw;
7249}
7250
7251
7252IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7253{
7254 uint16_t const fFcw = pFpuState->FCW;
7255 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7256
7257 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7258 {
7259 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7260 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7261 else
7262 {
7263 /* Special case:
7264 2^+1.0 - 1.0 = 1.0
7265 2^-1.0 - 1.0 = -0.5 */
7266 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7267 && pr80Val->s.uMantissa == RT_BIT_64(63))
7268 {
7269 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7270 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7271 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7272 }
7273 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7274 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7275 else
7276 pFpuRes->r80Result = *pr80Val;
7277 fFsw |= X86_FSW_PE;
7278 if (!(fFcw & X86_FCW_PM))
7279 fFsw |= X86_FSW_ES | X86_FSW_B;
7280 }
7281 }
7282 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7283 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7284 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7285 pFpuRes->r80Result = *pr80Val;
7286 else if (RTFLOAT80U_IS_INF(pr80Val))
7287 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7288 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7289 {
7290 fFsw |= X86_FSW_DE;
7291 if (fFcw & X86_FCW_DM)
7292 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7293 else
7294 {
7295 pFpuRes->r80Result = *pr80Val;
7296 fFsw |= X86_FSW_ES | X86_FSW_B;
7297 }
7298 }
7299 else
7300 {
7301 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7302 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7303 && (fFcw & X86_FCW_IM))
7304 pFpuRes->r80Result = g_r80Indefinite;
7305 else
7306 {
7307 pFpuRes->r80Result = *pr80Val;
7308 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7309 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7310 }
7311 fFsw |= X86_FSW_IE;
7312 if (!(fFcw & X86_FCW_IM))
7313 fFsw |= X86_FSW_ES | X86_FSW_B;
7314 }
7315 pFpuRes->FSW = fFsw;
7316}
7317
7318#endif /* IEM_WITHOUT_ASSEMBLY */
7319
7320IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7321{
7322 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7323}
7324
7325IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7326{
7327 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7328}
7329
7330#ifdef IEM_WITHOUT_ASSEMBLY
7331
7332IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7333{
7334 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7335 pFpuRes->r80Result = *pr80Val;
7336 pFpuRes->r80Result.s.fSign = 0;
7337}
7338
7339
7340IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7341{
7342 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7343 pFpuRes->r80Result = *pr80Val;
7344 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7345}
7346
7347
7348IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7349{
7350 uint16_t const fFcw = pFpuState->FCW;
7351 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7352
7353 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7354 {
7355 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7356 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7357
7358 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7359 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7360 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7361 }
7362 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7363 {
7364 fFsw |= X86_FSW_ZE;
7365 if (fFcw & X86_FCW_ZM)
7366 {
7367 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7368 pFpuResTwo->r80Result2 = *pr80Val;
7369 }
7370 else
7371 {
7372 pFpuResTwo->r80Result2 = *pr80Val;
7373 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7374 }
7375 }
7376 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7377 {
7378 fFsw |= X86_FSW_DE;
7379 if (fFcw & X86_FCW_DM)
7380 {
7381 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7382 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7383 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7384 int32_t iExponent = -16382;
7385 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7386 {
7387 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7388 iExponent--;
7389 }
7390
7391 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7392 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7393 }
7394 else
7395 {
7396 pFpuResTwo->r80Result2 = *pr80Val;
7397 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7398 }
7399 }
7400 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7401 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7402 {
7403 pFpuResTwo->r80Result1 = *pr80Val;
7404 pFpuResTwo->r80Result2 = *pr80Val;
7405 }
7406 else if (RTFLOAT80U_IS_INF(pr80Val))
7407 {
7408 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7409 pFpuResTwo->r80Result2 = *pr80Val;
7410 }
7411 else
7412 {
7413 if (fFcw & X86_FCW_IM)
7414 {
7415 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7416 pFpuResTwo->r80Result1 = g_r80Indefinite;
7417 else
7418 {
7419 pFpuResTwo->r80Result1 = *pr80Val;
7420 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7421 }
7422 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7423 }
7424 else
7425 {
7426 pFpuResTwo->r80Result2 = *pr80Val;
7427 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7428 }
7429 fFsw |= X86_FSW_IE;
7430 }
7431 pFpuResTwo->FSW = fFsw;
7432}
7433#endif /* IEM_WITHOUT_ASSEMBLY */
7434
7435#if defined(IEM_WITHOUT_ASSEMBLY)
7436
7437static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7438{
7439 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7440 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7441 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7442 extFloat80_t v;
7443 (void)fFcw;
7444
7445 v = extF80_ylog2x(y, x, &SoftState);
7446 iemFpuSoftF80ToIprt(pr80Result, v);
7447
7448 return fFsw;
7449}
7450
7451IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7452 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7453{
7454 uint16_t const fFcw = pFpuState->FCW;
7455 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7456
7457 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7458 {
7459 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7460
7461 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7462 if (!(fFcw & X86_FCW_PM))
7463 fFsw |= X86_FSW_ES | X86_FSW_B;
7464 }
7465 else
7466 {
7467 fFsw |= X86_FSW_IE;
7468
7469 if (!(fFcw & X86_FCW_IM))
7470 {
7471 pFpuRes->r80Result = *pr80Val2;
7472 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7473 }
7474 else
7475 {
7476 pFpuRes->r80Result = g_r80Indefinite;
7477 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7478 }
7479 }
7480
7481 pFpuRes->FSW = fFsw;
7482}
7483#endif /* IEM_WITHOUT_ASSEMBLY */
7484
7485IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7486 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7487{
7488 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7489}
7490
7491IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7492 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7493{
7494 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7495}
7496
7497#if defined(IEM_WITHOUT_ASSEMBLY)
7498
7499static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7500{
7501 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7502 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7503 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7504 extFloat80_t v;
7505 (void)fFcw;
7506
7507 v = extF80_ylog2xp1(y, x, &SoftState);
7508 iemFpuSoftF80ToIprt(pr80Result, v);
7509
7510 return fFsw;
7511}
7512
7513IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7514 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7515{
7516 uint16_t const fFcw = pFpuState->FCW;
7517 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7518
7519 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7520 {
7521 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7522
7523 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7524 if (!(fFcw & X86_FCW_PM))
7525 fFsw |= X86_FSW_ES | X86_FSW_B;
7526 }
7527 else
7528 {
7529 fFsw |= X86_FSW_IE;
7530
7531 if (!(fFcw & X86_FCW_IM))
7532 {
7533 pFpuRes->r80Result = *pr80Val2;
7534 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7535 }
7536 else
7537 {
7538 pFpuRes->r80Result = g_r80Indefinite;
7539 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7540 }
7541 }
7542
7543 pFpuRes->FSW = fFsw;
7544}
7545
7546#endif /* IEM_WITHOUT_ASSEMBLY */
7547
7548IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7549 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7550{
7551 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7552}
7553
7554IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7555 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7556{
7557 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7558}
7559
7560
7561/*********************************************************************************************************************************
7562* MMX, SSE & AVX *
7563*********************************************************************************************************************************/
7564
7565/*
7566 * MOVSLDUP / VMOVSLDUP
7567 */
7568IEM_DECL_IMPL_DEF(void, iemAImpl_movsldup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7569{
7570 puDst->au32[0] = puSrc->au32[0];
7571 puDst->au32[1] = puSrc->au32[0];
7572 puDst->au32[2] = puSrc->au32[2];
7573 puDst->au32[3] = puSrc->au32[2];
7574}
7575
7576#ifdef IEM_WITH_VEX
7577
7578IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7579{
7580 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7581 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7582 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7583 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7584 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7585 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7586 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7587 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7588}
7589
7590
7591IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7592{
7593 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7594 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7595 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7596 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7597 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7598 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7599 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7600 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7601}
7602
7603#endif /* IEM_WITH_VEX */
7604
7605
7606/*
7607 * MOVSHDUP / VMOVSHDUP
7608 */
7609IEM_DECL_IMPL_DEF(void, iemAImpl_movshdup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7610{
7611 puDst->au32[0] = puSrc->au32[1];
7612 puDst->au32[1] = puSrc->au32[1];
7613 puDst->au32[2] = puSrc->au32[3];
7614 puDst->au32[3] = puSrc->au32[3];
7615}
7616
7617#ifdef IEM_WITH_VEX
7618
7619IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7620{
7621 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7622 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7623 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7624 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7625 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7626 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7627 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7628 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7629}
7630
7631
7632IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7633{
7634 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7635 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7636 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7637 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7638 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7639 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7640 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7641 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7642}
7643
7644#endif /* IEM_WITH_VEX */
7645
7646
7647/*
7648 * MOVDDUP / VMOVDDUP
7649 */
7650IEM_DECL_IMPL_DEF(void, iemAImpl_movddup,(PRTUINT128U puDst, uint64_t uSrc))
7651{
7652 puDst->au64[0] = uSrc;
7653 puDst->au64[1] = uSrc;
7654}
7655
7656#ifdef IEM_WITH_VEX
7657
7658IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7659{
7660 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7661 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7662 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7663 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7664}
7665
7666IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7667{
7668 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7669 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7670 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7671 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7672}
7673
7674#endif /* IEM_WITH_VEX */
7675
7676
7677/*
 * PAND / VPAND / ANDPS / VANDPS / ANDPD / VANDPD
7679 */
7680#ifdef IEM_WITHOUT_ASSEMBLY
7681
7682IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7683{
7684 RT_NOREF(pFpuState);
7685 *puDst &= *puSrc;
7686}
7687
7688
7689IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7690{
7691 RT_NOREF(pFpuState);
7692 puDst->au64[0] &= puSrc->au64[0];
7693 puDst->au64[1] &= puSrc->au64[1];
7694}
7695
7696#endif
7697
7698IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7699 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7700{
7701 RT_NOREF(pExtState);
7702 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7703 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7704}
7705
7706
7707IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7708 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7709{
7710 RT_NOREF(pExtState);
7711 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7712 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7713 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7714 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7715}
7716
7717
7718/*
 * PANDN / VPANDN / ANDNPS / VANDNPS / ANDNPD / VANDNPD
7720 */
7721#ifdef IEM_WITHOUT_ASSEMBLY
7722
7723IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7724{
7725 RT_NOREF(pFpuState);
7726 *puDst = ~*puDst & *puSrc;
7727}
7728
7729
7730IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7731{
7732 RT_NOREF(pFpuState);
7733 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7734 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7735}
7736
7737#endif
7738
7739IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7740 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7741{
7742 RT_NOREF(pExtState);
7743 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7744 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7745}
7746
7747
7748IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7749 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7750{
7751 RT_NOREF(pExtState);
7752 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7753 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7754 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7755 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7756}
7757
7758
7759/*
 * POR / VPOR / ORPS / VORPS / ORPD / VORPD
7761 */
7762#ifdef IEM_WITHOUT_ASSEMBLY
7763
7764IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7765{
7766 RT_NOREF(pFpuState);
7767 *puDst |= *puSrc;
7768}
7769
7770
7771IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7772{
7773 RT_NOREF(pFpuState);
7774 puDst->au64[0] |= puSrc->au64[0];
7775 puDst->au64[1] |= puSrc->au64[1];
7776}
7777
7778#endif
7779
7780IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7781 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7782{
7783 RT_NOREF(pExtState);
7784 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7785 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7786}
7787
7788
7789IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7790 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7791{
7792 RT_NOREF(pExtState);
7793 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7794 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7795 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7796 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7797}
7798
7799
7800/*
 * PXOR / VPXOR / XORPS / VXORPS / XORPD / VXORPD
7802 */
7803#ifdef IEM_WITHOUT_ASSEMBLY
7804
7805IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7806{
7807 RT_NOREF(pFpuState);
7808 *puDst ^= *puSrc;
7809}
7810
7811
7812IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7813{
7814 RT_NOREF(pFpuState);
7815 puDst->au64[0] ^= puSrc->au64[0];
7816 puDst->au64[1] ^= puSrc->au64[1];
7817}
7818
7819#endif
7820
7821IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7822 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7823{
7824 RT_NOREF(pExtState);
7825 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7826 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7827}
7828
7829
7830IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7831 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7832{
7833 RT_NOREF(pExtState);
7834 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7835 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7836 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7837 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7838}
7839
7840
7841/*
7842 * PCMPEQB / VPCMPEQB
7843 */
7844#ifdef IEM_WITHOUT_ASSEMBLY
7845
7846IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7847{
7848 RT_NOREF(pFpuState);
7849 RTUINT64U uSrc1 = { *puDst };
7850 RTUINT64U uSrc2 = { *puSrc };
7851 RTUINT64U uDst;
7852 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7853 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7854 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7855 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7856 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7857 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7858 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7859 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7860 *puDst = uDst.u;
7861}
7862
7863
7864IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7865{
7866 RT_NOREF(pFpuState);
7867 RTUINT128U uSrc1 = *puDst;
7868 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7869 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7870 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7871 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7872 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7873 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7874 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7875 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7876 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7877 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7878 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7879 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7880 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7881 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7882 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7883 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7884}
7885
7886#endif
7887
7888IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7889 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7890{
7891 RT_NOREF(pExtState);
7892 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7893 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7894 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7895 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7896 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7897 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7898 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7899 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7900 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7901 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7902 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7903 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7904 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7905 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7906 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7907 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7908}
7909
7910IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7911 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7912{
7913 RT_NOREF(pExtState);
7914 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7915 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7916 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7917 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7918 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7919 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7920 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7921 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7922 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7923 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7924 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7925 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7926 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7927 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7928 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7929 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7930 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7931 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7932 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7933 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7934 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7935 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7936 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7937 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7938 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7939 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7940 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7941 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7942 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7943 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7944 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7945 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7946}
7947
7948
7949/*
7950 * PCMPEQW / VPCMPEQW
7951 */
7952#ifdef IEM_WITHOUT_ASSEMBLY
7953
7954IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7955{
7956 RT_NOREF(pFpuState);
7957 RTUINT64U uSrc1 = { *puDst };
7958 RTUINT64U uSrc2 = { *puSrc };
7959 RTUINT64U uDst;
7960 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7961 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7962 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7963 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7964 *puDst = uDst.u;
7965}
7966
7967
7968IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7969{
7970 RT_NOREF(pFpuState);
7971 RTUINT128U uSrc1 = *puDst;
7972 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7973 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7974 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7975 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7976 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7977 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7978 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7979 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7980}
7981
7982#endif
7983
7984IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7985 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7986{
7987 RT_NOREF(pExtState);
7988 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7989 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7990 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7991 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7992 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7993 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7994 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7995 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7996}
7997
7998IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7999 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8000{
8001 RT_NOREF(pExtState);
8002 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8003 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8004 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8005 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8006 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8007 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8008 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8009 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8010 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8011 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8012 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8013 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8014 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8015 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8016 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8017 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8018}
8019
8020
8021/*
8022 * PCMPEQD / VPCMPEQD.
8023 */
8024#ifdef IEM_WITHOUT_ASSEMBLY
8025
8026IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8027{
8028 RT_NOREF(pFpuState);
8029 RTUINT64U uSrc1 = { *puDst };
8030 RTUINT64U uSrc2 = { *puSrc };
8031 RTUINT64U uDst;
8032 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8033 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8034 *puDst = uDst.u;
8035}
8036
8037
8038IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8039{
8040 RT_NOREF(pFpuState);
8041 RTUINT128U uSrc1 = *puDst;
8042 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8043 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8044 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8045 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8046}
8047
8048#endif /* IEM_WITHOUT_ASSEMBLY */
8049
8050IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8051 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8052{
8053 RT_NOREF(pExtState);
8054 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8055 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8056 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8057 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8058}
8059
8060IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8061 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8062{
8063 RT_NOREF(pExtState);
8064 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8065 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8066 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8067 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8068 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8069 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8070 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8071 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8072}
8073
8074
8075/*
8076 * PCMPEQQ / VPCMPEQQ.
8077 */
8078IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8079{
8080 RT_NOREF(pFpuState);
8081 RTUINT128U uSrc1 = *puDst;
8082 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8083 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8084}
8085
8086IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8087 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8088{
8089 RT_NOREF(pExtState);
8090 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8091 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8092}
8093
8094IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8095 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8096{
8097 RT_NOREF(pExtState);
8098 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8099 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8100 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8101 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8102}
8103
8104
8105/*
8106 * PCMPGTB / VPCMPGTB
8107 */
8108#ifdef IEM_WITHOUT_ASSEMBLY
8109
8110IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8111{
8112 RT_NOREF(pFpuState);
8113 RTUINT64U uSrc1 = { *puDst };
8114 RTUINT64U uSrc2 = { *puSrc };
8115 RTUINT64U uDst;
8116 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8117 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8118 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8119 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8120 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8121 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8122 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8123 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8124 *puDst = uDst.u;
8125}
8126
8127
8128IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8129{
8130 RT_NOREF(pFpuState);
8131 RTUINT128U uSrc1 = *puDst;
8132 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8133 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8134 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8135 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8136 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8137 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8138 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8139 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8140 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8141 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8142 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8143 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8144 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8145 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8146 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8147 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8148}
8149
8150#endif
8151
8152IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8153 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8154{
8155 RT_NOREF(pExtState);
8156 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8157 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8158 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8159 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8160 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8161 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8162 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8163 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8164 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8165 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8166 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8167 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8168 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8169 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8170 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8171 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8172}
8173
8174IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8175 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8176{
8177 RT_NOREF(pExtState);
8178 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8179 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8180 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8181 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8182 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8183 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8184 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8185 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8186 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8187 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8188 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8189 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8190 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8191 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8192 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8193 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8194 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8195 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8196 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8197 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8198 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8199 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8200 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8201 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8202 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8203 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8204 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8205 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8206 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8207 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8208 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8209 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8210}
8211
8212
8213/*
8214 * PCMPGTW / VPCMPGTW
8215 */
8216#ifdef IEM_WITHOUT_ASSEMBLY
8217
8218IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8219{
8220 RT_NOREF(pFpuState);
8221 RTUINT64U uSrc1 = { *puDst };
8222 RTUINT64U uSrc2 = { *puSrc };
8223 RTUINT64U uDst;
8224 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8225 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8226 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8227 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8228 *puDst = uDst.u;
8229}
8230
8231
8232IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8233{
8234 RT_NOREF(pFpuState);
8235 RTUINT128U uSrc1 = *puDst;
8236 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8237 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8238 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8239 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8240 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8241 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8242 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8243 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8244}
8245
8246#endif
8247
8248IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8249 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8250{
8251 RT_NOREF(pExtState);
8252 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8253 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8254 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8255 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8256 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8257 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8258 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8259 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8260}
8261
8262IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8263 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8264{
8265 RT_NOREF(pExtState);
8266 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8267 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8268 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8269 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8270 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8271 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8272 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8273 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8274 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8275 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8276 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8277 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8278 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8279 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8280 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8281 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8282}
8283
8284
8285/*
8286 * PCMPGTD / VPCMPGTD.
8287 */
8288#ifdef IEM_WITHOUT_ASSEMBLY
8289
8290IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8291{
8292 RT_NOREF(pFpuState);
8293 RTUINT64U uSrc1 = { *puDst };
8294 RTUINT64U uSrc2 = { *puSrc };
8295 RTUINT64U uDst;
8296 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8297 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8298 *puDst = uDst.u;
8299}
8300
8301
8302IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8303{
8304 RT_NOREF(pFpuState);
8305 RTUINT128U uSrc1 = *puDst;
8306 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8307 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8308 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8309 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8310}
8311
8312#endif /* IEM_WITHOUT_ASSEMBLY */
8313
8314IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8315 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8316{
8317 RT_NOREF(pExtState);
8318 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8319 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8320 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8321 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8322}
8323
8324IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8325 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8326{
8327 RT_NOREF(pExtState);
8328 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8329 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8330 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8331 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8332 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8333 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8334 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8335 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8336}
8337
8338
8339/*
8340 * PCMPGTQ / VPCMPGTQ.
8341 */
8342IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8343{
8344 RT_NOREF(pFpuState);
8345 RTUINT128U uSrc1 = *puDst;
8346 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8347 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8348}
8349
8350IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8351 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8352{
8353 RT_NOREF(pExtState);
8354 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8355 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8356}
8357
8358IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8359 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8360{
8361 RT_NOREF(pExtState);
8362 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8363 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8364 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8365 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8366}
8367
8368
8369/*
8370 * PADDB / VPADDB
8371 */
8372#ifdef IEM_WITHOUT_ASSEMBLY
8373
8374IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8375{
8376 RT_NOREF(pFpuState);
8377 RTUINT64U uSrc1 = { *puDst };
8378 RTUINT64U uSrc2 = { *puSrc };
8379 RTUINT64U uDst;
8380 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8381 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8382 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8383 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8384 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8385 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8386 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8387 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8388 *puDst = uDst.u;
8389}
8390
8391
8392IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8393{
8394 RT_NOREF(pFpuState);
8395 RTUINT128U uSrc1 = *puDst;
8396 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8397 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8398 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8399 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8400 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8401 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8402 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8403 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8404 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8405 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8406 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8407 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8408 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8409 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8410 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8411 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8412}
8413
8414#endif
8415
8416
8417IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8418 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8419{
8420 RT_NOREF(pExtState);
8421 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8422 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8423 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8424 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8425 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8426 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8427 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8428 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8429 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8430 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8431 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8432 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8433 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8434 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8435 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8436 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8437}
8438
8439IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8440 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8441{
8442 RT_NOREF(pExtState);
8443 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8444 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8445 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8446 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8447 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8448 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8449 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8450 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8451 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8452 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8453 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8454 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8455 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8456 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8457 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8458 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8459 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8460 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8461 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8462 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8463 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8464 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8465 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8466 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8467 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8468 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8469 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8470 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8471 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8472 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8473 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8474 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8475}
8476
8477
8478/*
8479 * PADDSB / VPADDSB
8480 */
/**
 * Narrows a signed word-sized value to a byte with signed saturation.
 *
 * Inputs in the range -128..127 are returned truncated to 8 bits.  Any other
 * input yields 0x7f (INT8_MAX) when bit 15 of the input is clear and 0x80
 * (INT8_MIN) when it is set.
 *
 * @note    The argument is expanded several times and must therefore be free
 *          of side effects.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) /* out of range: sign bit selects 0x7f or 0x80 */ \
      : (uint8_t)(a_iWord) )
8485
8486#ifdef IEM_WITHOUT_ASSEMBLY
8487
8488IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8489{
8490 RT_NOREF(pFpuState);
8491 RTUINT64U uSrc1 = { *puDst };
8492 RTUINT64U uSrc2 = { *puSrc };
8493 RTUINT64U uDst;
8494 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8495 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8496 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8497 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8498 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8499 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8500 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8501 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8502 *puDst = uDst.u;
8503}
8504
8505
8506IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8507{
8508 RT_NOREF(pFpuState);
8509 RTUINT128U uSrc1 = *puDst;
8510 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8511 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8512 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8513 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8514 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8515 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8516 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8517 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8518 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8519 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8520 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8521 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8522 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8523 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8524 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8525 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8526}
8527
8528#endif
8529
8530
8531/*
 * PADDUSB / VPADDUSB
8533 */
/**
 * Narrows an unsigned word-sized value to a byte with unsigned saturation.
 *
 * The input is first converted to uint16_t; values up to 0xff are returned
 * unchanged and anything larger yields 0xff (UINT8_MAX).
 *
 * @note    The argument is expanded more than once and must therefore be free
 *          of side effects.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff /* clamp to UINT8_MAX */ \
      : (uint8_t)(a_uWord) )
8538
8539#ifdef IEM_WITHOUT_ASSEMBLY
8540
8541IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8542{
8543 RT_NOREF(pFpuState);
8544 RTUINT64U uSrc1 = { *puDst };
8545 RTUINT64U uSrc2 = { *puSrc };
8546 RTUINT64U uDst;
8547 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8548 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8549 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8550 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8551 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8552 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8553 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8554 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8555 *puDst = uDst.u;
8556}
8557
8558
8559IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8560{
8561 RT_NOREF(pFpuState);
8562 RTUINT128U uSrc1 = *puDst;
8563 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8564 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8565 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8566 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8567 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8568 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8569 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8570 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8571 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8572 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8573 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8574 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8575 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8576 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8577 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8578 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8579}
8580
8581#endif
8582
8583
8584/*
8585 * PADDW / VPADDW
8586 */
8587#ifdef IEM_WITHOUT_ASSEMBLY
8588
8589IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8590{
8591 RT_NOREF(pFpuState);
8592 RTUINT64U uSrc1 = { *puDst };
8593 RTUINT64U uSrc2 = { *puSrc };
8594 RTUINT64U uDst;
8595 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8596 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8597 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8598 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8599 *puDst = uDst.u;
8600}
8601
8602
8603IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8604{
8605 RT_NOREF(pFpuState);
8606 RTUINT128U uSrc1 = *puDst;
8607 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8608 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8609 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8610 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8611 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8612 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8613 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8614 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8615}
8616
8617#endif
8618
8619
8620IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8621 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8622{
8623 RT_NOREF(pExtState);
8624 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8625 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8626 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8627 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8628 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8629 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8630 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8631 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8632}
8633
8634IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8635 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8636{
8637 RT_NOREF(pExtState);
8638 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8639 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8640 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8641 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8642 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8643 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8644 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8645 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8646 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8647 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8648 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8649 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8650 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8651 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8652 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8653 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8654}
8655
8656
8657/*
8658 * PADDSW / VPADDSW
8659 */
/** Clamps a signed 32-bit value to the int16_t range and yields the result as
 * a 16-bit pattern.  Out-of-range values become 0x7fff (INT16_MAX) when the
 * source is positive and 0x8000 (INT16_MIN) when it is negative; the sign is
 * taken from bit 31 of the source.
 * @note The argument is evaluated more than once. */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    (   (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) )
8664
8665#ifdef IEM_WITHOUT_ASSEMBLY
8666
8667IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8668{
8669 RT_NOREF(pFpuState);
8670 RTUINT64U uSrc1 = { *puDst };
8671 RTUINT64U uSrc2 = { *puSrc };
8672 RTUINT64U uDst;
8673 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8674 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8675 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8676 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8677 *puDst = uDst.u;
8678}
8679
8680
8681IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8682{
8683 RT_NOREF(pFpuState);
8684 RTUINT128U uSrc1 = *puDst;
8685 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8686 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8687 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8688 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8689 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8690 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8691 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8692 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8693}
8694
8695#endif
8696
8697
8698/*
8699 * PADDUSW / VPADDUSW
8700 */
/** Clamps an unsigned 32-bit value to the uint16_t range: anything above
 * 0xffff (UINT16_MAX) becomes 0xffff, everything else is passed through.
 * @note The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    (   (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
8705
8706#ifdef IEM_WITHOUT_ASSEMBLY
8707
8708IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8709{
8710 RT_NOREF(pFpuState);
8711 RTUINT64U uSrc1 = { *puDst };
8712 RTUINT64U uSrc2 = { *puSrc };
8713 RTUINT64U uDst;
8714 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8715 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8716 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8717 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8718 *puDst = uDst.u;
8719}
8720
8721
8722IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8723{
8724 RT_NOREF(pFpuState);
8725 RTUINT128U uSrc1 = *puDst;
8726 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8727 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8728 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8729 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8730 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8731 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8732 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8733 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8734}
8735
8736#endif
8737
8738
8739/*
8740 * PADDD / VPADDD.
8741 */
8742#ifdef IEM_WITHOUT_ASSEMBLY
8743
8744IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8745{
8746 RT_NOREF(pFpuState);
8747 RTUINT64U uSrc1 = { *puDst };
8748 RTUINT64U uSrc2 = { *puSrc };
8749 RTUINT64U uDst;
8750 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8751 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8752 *puDst = uDst.u;
8753}
8754
8755
8756IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8757{
8758 RT_NOREF(pFpuState);
8759 RTUINT128U uSrc1 = *puDst;
8760 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8761 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8762 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8763 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8764}
8765
8766#endif /* IEM_WITHOUT_ASSEMBLY */
8767
8768IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8769 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8770{
8771 RT_NOREF(pExtState);
8772 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8773 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8774 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8775 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8776}
8777
8778IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8779 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8780{
8781 RT_NOREF(pExtState);
8782 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8783 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8784 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8785 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8786 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8787 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8788 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8789 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8790}
8791
8792
8793/*
8794 * PADDQ / VPADDQ.
8795 */
8796#ifdef IEM_WITHOUT_ASSEMBLY
8797
8798IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8799{
8800 RT_NOREF(pFpuState);
8801 *puDst = *puDst + *puSrc;
8802}
8803
8804IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8805{
8806 RT_NOREF(pFpuState);
8807 RTUINT128U uSrc1 = *puDst;
8808 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8809 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8810}
8811
8812#endif
8813
8814IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8815 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8816{
8817 RT_NOREF(pExtState);
8818 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8819 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8820}
8821
8822IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8823 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8824{
8825 RT_NOREF(pExtState);
8826 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8827 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8828 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8829 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8830}
8831
8832
8833/*
8834 * PSUBB / VPSUBB
8835 */
8836#ifdef IEM_WITHOUT_ASSEMBLY
8837
8838IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8839{
8840 RT_NOREF(pFpuState);
8841 RTUINT64U uSrc1 = { *puDst };
8842 RTUINT64U uSrc2 = { *puSrc };
8843 RTUINT64U uDst;
8844 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8845 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8846 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8847 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8848 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8849 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8850 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8851 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8852 *puDst = uDst.u;
8853}
8854
8855
8856IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8857{
8858 RT_NOREF(pFpuState);
8859 RTUINT128U uSrc1 = *puDst;
8860 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8861 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8862 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8863 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8864 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8865 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8866 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8867 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8868 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8869 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8870 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8871 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8872 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8873 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8874 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8875 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8876}
8877
8878#endif
8879
8880IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8881 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8882{
8883 RT_NOREF(pExtState);
8884 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8885 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8886 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8887 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8888 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8889 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8890 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8891 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8892 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8893 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8894 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8895 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8896 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8897 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8898 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8899 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8900}
8901
8902IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8903 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8904{
8905 RT_NOREF(pExtState);
8906 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8907 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8908 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8909 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8910 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8911 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8912 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8913 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8914 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8915 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8916 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8917 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8918 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8919 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8920 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8921 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8922 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
8923 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
8924 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
8925 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
8926 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
8927 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
8928 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
8929 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
8930 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
8931 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
8932 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
8933 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
8934 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
8935 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
8936 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
8937 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
8938}
8939
8940
8941/*
8942 * PSUBSB / VPSUBSB
8943 */
8944#ifdef IEM_WITHOUT_ASSEMBLY
8945
8946IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8947{
8948 RT_NOREF(pFpuState);
8949 RTUINT64U uSrc1 = { *puDst };
8950 RTUINT64U uSrc2 = { *puSrc };
8951 RTUINT64U uDst;
8952 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
8953 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
8954 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
8955 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
8956 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
8957 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
8958 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
8959 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
8960 *puDst = uDst.u;
8961}
8962
8963
8964IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8965{
8966 RT_NOREF(pFpuState);
8967 RTUINT128U uSrc1 = *puDst;
8968 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
8969 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
8970 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
8971 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
8972 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
8973 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
8974 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
8975 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
8976 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
8977 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
8978 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
8979 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
8980 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
8981 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
8982 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
8983 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
8984}
8985
8986#endif
8987
8988
8989/*
8990 * PSUBUSB / VPSUBUSB
8991 */
/** Saturation helper for unsigned byte subtraction: a difference whose low 16
 * bits fit in 0..0xff is passed through, anything else (in particular a
 * negative difference, which reads as a large 16-bit value) becomes 0.
 * @note The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    (   (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
8996
8997#ifdef IEM_WITHOUT_ASSEMBLY
8998
8999IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9000{
9001 RT_NOREF(pFpuState);
9002 RTUINT64U uSrc1 = { *puDst };
9003 RTUINT64U uSrc2 = { *puSrc };
9004 RTUINT64U uDst;
9005 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9006 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9007 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9008 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9009 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9010 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9011 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9012 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9013 *puDst = uDst.u;
9014}
9015
9016
9017IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9018{
9019 RT_NOREF(pFpuState);
9020 RTUINT128U uSrc1 = *puDst;
9021 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9022 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9023 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9024 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9025 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9026 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9027 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9028 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9029 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9030 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9031 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9032 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9033 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9034 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9035 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9036 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9037}
9038
9039#endif
9040
9041
9042/*
9043 * PSUBW / VPSUBW
9044 */
9045#ifdef IEM_WITHOUT_ASSEMBLY
9046
9047IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9048{
9049 RT_NOREF(pFpuState);
9050 RTUINT64U uSrc1 = { *puDst };
9051 RTUINT64U uSrc2 = { *puSrc };
9052 RTUINT64U uDst;
9053 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9054 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9055 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9056 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9057 *puDst = uDst.u;
9058}
9059
9060
9061IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9062{
9063 RT_NOREF(pFpuState);
9064 RTUINT128U uSrc1 = *puDst;
9065 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9066 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9067 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9068 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9069 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9070 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9071 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9072 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9073}
9074
9075#endif
9076
9077IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9078 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9079{
9080 RT_NOREF(pExtState);
9081 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9082 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9083 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9084 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9085 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9086 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9087 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9088 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9089}
9090
9091IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9092 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9093{
9094 RT_NOREF(pExtState);
9095 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9096 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9097 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9098 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9099 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9100 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9101 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9102 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9103 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9104 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9105 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9106 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9107 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9108 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9109 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9110 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9111}
9112
9113
9114/*
9115 * PSUBSW / VPSUBSW
9116 */
9117#ifdef IEM_WITHOUT_ASSEMBLY
9118
9119IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9120{
9121 RT_NOREF(pFpuState);
9122 RTUINT64U uSrc1 = { *puDst };
9123 RTUINT64U uSrc2 = { *puSrc };
9124 RTUINT64U uDst;
9125 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9126 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9127 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9128 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9129 *puDst = uDst.u;
9130}
9131
9132
9133IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9134{
9135 RT_NOREF(pFpuState);
9136 RTUINT128U uSrc1 = *puDst;
9137 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9138 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9139 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9140 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9141 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9142 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9143 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9144 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9145}
9146
9147#endif
9148
9149
9150/*
9151 * PSUBUSW / VPSUBUSW
9152 */
/** Saturation helper for unsigned word subtraction: a difference that fits in
 * 0..0xffff is passed through, anything else (in particular a negative
 * difference, which reads as a large 32-bit value) becomes 0.
 * @note The argument is evaluated more than once. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    (   (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9157
9158#ifdef IEM_WITHOUT_ASSEMBLY
9159
9160IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9161{
9162 RT_NOREF(pFpuState);
9163 RTUINT64U uSrc1 = { *puDst };
9164 RTUINT64U uSrc2 = { *puSrc };
9165 RTUINT64U uDst;
9166 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9167 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9168 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9169 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9170 *puDst = uDst.u;
9171}
9172
9173
9174IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9175{
9176 RT_NOREF(pFpuState);
9177 RTUINT128U uSrc1 = *puDst;
9178 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9179 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9180 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9181 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9182 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9183 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9184 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9185 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9186}
9187
9188#endif
9189
9190
9191/*
9192 * PSUBD / VPSUBD.
9193 */
9194#ifdef IEM_WITHOUT_ASSEMBLY
9195
9196IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9197{
9198 RT_NOREF(pFpuState);
9199 RTUINT64U uSrc1 = { *puDst };
9200 RTUINT64U uSrc2 = { *puSrc };
9201 RTUINT64U uDst;
9202 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9203 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9204 *puDst = uDst.u;
9205}
9206
9207
9208IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9209{
9210 RT_NOREF(pFpuState);
9211 RTUINT128U uSrc1 = *puDst;
9212 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9213 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9214 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9215 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9216}
9217
9218#endif /* IEM_WITHOUT_ASSEMBLY */
9219
9220IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9221 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9222{
9223 RT_NOREF(pExtState);
9224 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9225 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9226 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9227 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9228}
9229
9230IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9231 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9232{
9233 RT_NOREF(pExtState);
9234 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9235 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9236 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9237 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9238 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9239 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9240 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9241 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9242}
9243
9244
9245/*
9246 * PSUBQ / VPSUBQ.
9247 */
9248#ifdef IEM_WITHOUT_ASSEMBLY
9249
9250IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9251{
9252 RT_NOREF(pFpuState);
9253 *puDst = *puDst - *puSrc;
9254}
9255
9256IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9257{
9258 RT_NOREF(pFpuState);
9259 RTUINT128U uSrc1 = *puDst;
9260 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9261 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9262}
9263
9264#endif
9265
9266IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9267 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9268{
9269 RT_NOREF(pExtState);
9270 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9271 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9272}
9273
9274IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9275 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9276{
9277 RT_NOREF(pExtState);
9278 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9279 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9280 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9281 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9282}
9283
9284
9285
9286/*
9287 * PMULLW / VPMULLW / PMULLD / VPMULLD
9288 */
9289#ifdef IEM_WITHOUT_ASSEMBLY
9290
9291IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9292{
9293 RT_NOREF(pFpuState);
9294 RTUINT64U uSrc1 = { *puDst };
9295 RTUINT64U uSrc2 = { *puSrc };
9296 RTUINT64U uDst;
9297 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9298 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9299 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9300 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9301 *puDst = uDst.u;
9302}
9303
9304
9305IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9306{
9307 RT_NOREF(pFpuState);
9308 RTUINT128U uSrc1 = *puDst;
9309 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9310 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9311 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9312 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9313 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9314 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9315 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9316 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9317}
9318
9319#endif
9320
9321IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9322{
9323 RTUINT128U uSrc1 = *puDst;
9324
9325 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9326 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9327 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9328 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9329 RT_NOREF(pFpuState);
9330}
9331
9332
9333IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9334{
9335 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9336 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9337 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9338 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9339 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9340 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9341 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9342 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9343}
9344
9345
9346IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9347{
9348 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9349 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9350 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9351 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9352 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9353 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9354 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9355 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9356 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9357 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9358 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9359 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9360 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9361 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9362 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9363 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9364}
9365
9366
9367IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9368{
9369 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9370 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9371 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9372 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9373}
9374
9375
9376IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9377{
9378 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9379 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9380 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9381 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9382 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9383 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9384 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9385 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9386}
9387
9388
9389/*
9390 * PMULHW / VPMULHW
9391 */
9392#ifdef IEM_WITHOUT_ASSEMBLY
9393
9394IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9395{
9396 RT_NOREF(pFpuState);
9397 RTUINT64U uSrc1 = { *puDst };
9398 RTUINT64U uSrc2 = { *puSrc };
9399 RTUINT64U uDst;
9400 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9401 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9402 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9403 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9404 *puDst = uDst.u;
9405}
9406
9407
9408IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9409{
9410 RT_NOREF(pFpuState);
9411 RTUINT128U uSrc1 = *puDst;
9412 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9413 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9414 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9415 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9416 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9417 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9418 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9419 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9420}
9421
9422#endif
9423
9424IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9425{
9426 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9427 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9428 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9429 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9430 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9431 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9432 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9433 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9434}
9435
9436
9437IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9438{
9439 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9440 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9441 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9442 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9443 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9444 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9445 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9446 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9447 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9448 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9449 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9450 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9451 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9452 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9453 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9454 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9455}
9456
9457
9458/*
9459 * PMULHUW / VPMULHUW
9460 */
9461#ifdef IEM_WITHOUT_ASSEMBLY
9462
9463IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9464{
9465 RTUINT64U uSrc1 = { *puDst };
9466 RTUINT64U uSrc2 = { *puSrc };
9467 RTUINT64U uDst;
9468 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9469 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9470 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9471 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9472 *puDst = uDst.u;
9473}
9474
9475
9476IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9477{
9478 RTUINT128U uSrc1 = *puDst;
9479 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9480 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9481 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9482 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9483 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9484 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9485 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9486 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9487}
9488
9489#endif
9490
9491IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9492{
9493 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9494 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9495 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9496 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9497 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9498 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9499 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9500 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9501}
9502
9503
9504IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9505{
9506 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9507 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9508 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9509 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9510 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9511 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9512 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9513 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9514 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9515 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9516 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9517 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9518 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9519 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9520 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9521 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9522}
9523
9524
9525/*
9526 * PSRLW / VPSRLW
9527 */
9528#ifdef IEM_WITHOUT_ASSEMBLY
9529
9530IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9531{
9532 RTUINT64U uSrc1 = { *puDst };
9533 RTUINT64U uSrc2 = { *puSrc };
9534 RTUINT64U uDst;
9535
9536 if (uSrc2.au64[0] <= 15)
9537 {
9538 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9539 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9540 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9541 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9542 }
9543 else
9544 {
9545 uDst.au64[0] = 0;
9546 }
9547 *puDst = uDst.u;
9548}
9549
9550
9551IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9552{
9553 RTUINT64U uSrc1 = { *puDst };
9554 RTUINT64U uDst;
9555
9556 if (uShift <= 15)
9557 {
9558 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9559 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9560 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9561 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9562 }
9563 else
9564 {
9565 uDst.au64[0] = 0;
9566 }
9567 *puDst = uDst.u;
9568}
9569
9570
9571IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9572{
9573 RTUINT128U uSrc1 = *puDst;
9574
9575 if (puSrc->au64[0] <= 15)
9576 {
9577 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9578 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9579 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9580 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9581 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9582 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9583 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9584 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9585 }
9586 else
9587 {
9588 puDst->au64[0] = 0;
9589 puDst->au64[1] = 0;
9590 }
9591}
9592
9593IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9594{
9595 RTUINT128U uSrc1 = *puDst;
9596
9597 if (uShift <= 15)
9598 {
9599 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9600 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9601 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9602 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9603 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9604 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9605 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9606 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9607 }
9608 else
9609 {
9610 puDst->au64[0] = 0;
9611 puDst->au64[1] = 0;
9612 }
9613}
9614
9615#endif
9616
9617
9618/*
9619 * PSRAW / VPSRAW
9620 */
9621#ifdef IEM_WITHOUT_ASSEMBLY
9622
9623IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9624{
9625 RTUINT64U uSrc1 = { *puDst };
9626 RTUINT64U uSrc2 = { *puSrc };
9627 RTUINT64U uDst;
9628
9629 if (uSrc2.au64[0] <= 15)
9630 {
9631 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
9632 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
9633 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
9634 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
9635 }
9636 else
9637 {
9638 uDst.au64[0] = 0;
9639 }
9640 *puDst = uDst.u;
9641}
9642
9643
9644IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9645{
9646 RTUINT64U uSrc1 = { *puDst };
9647 RTUINT64U uDst;
9648
9649 if (uShift <= 15)
9650 {
9651 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
9652 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
9653 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
9654 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
9655 }
9656 else
9657 {
9658 uDst.au64[0] = 0;
9659 }
9660 *puDst = uDst.u;
9661}
9662
9663
9664IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9665{
9666 RTUINT128U uSrc1 = *puDst;
9667
9668 if (puSrc->au64[0] <= 15)
9669 {
9670 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
9671 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
9672 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
9673 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
9674 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
9675 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
9676 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
9677 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
9678 }
9679 else
9680 {
9681 puDst->au64[0] = 0;
9682 puDst->au64[1] = 0;
9683 }
9684}
9685
9686IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9687{
9688 RTUINT128U uSrc1 = *puDst;
9689
9690 if (uShift <= 15)
9691 {
9692 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
9693 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
9694 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
9695 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
9696 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
9697 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
9698 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
9699 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
9700 }
9701 else
9702 {
9703 puDst->au64[0] = 0;
9704 puDst->au64[1] = 0;
9705 }
9706}
9707
9708#endif
9709
9710
9711/*
9712 * PSLLW / VPSLLW
9713 */
9714#ifdef IEM_WITHOUT_ASSEMBLY
9715
9716IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9717{
9718 RTUINT64U uSrc1 = { *puDst };
9719 RTUINT64U uSrc2 = { *puSrc };
9720 RTUINT64U uDst;
9721
9722 if (uSrc2.au64[0] <= 15)
9723 {
9724 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
9725 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
9726 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
9727 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
9728 }
9729 else
9730 {
9731 uDst.au64[0] = 0;
9732 }
9733 *puDst = uDst.u;
9734}
9735
9736
9737IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9738{
9739 RTUINT64U uSrc1 = { *puDst };
9740 RTUINT64U uDst;
9741
9742 if (uShift <= 15)
9743 {
9744 uDst.au16[0] = uSrc1.au16[0] << uShift;
9745 uDst.au16[1] = uSrc1.au16[1] << uShift;
9746 uDst.au16[2] = uSrc1.au16[2] << uShift;
9747 uDst.au16[3] = uSrc1.au16[3] << uShift;
9748 }
9749 else
9750 {
9751 uDst.au64[0] = 0;
9752 }
9753 *puDst = uDst.u;
9754}
9755
9756
9757IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9758{
9759 RTUINT128U uSrc1 = *puDst;
9760
9761 if (puSrc->au64[0] <= 15)
9762 {
9763 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
9764 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
9765 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
9766 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
9767 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
9768 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
9769 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
9770 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
9771 }
9772 else
9773 {
9774 puDst->au64[0] = 0;
9775 puDst->au64[1] = 0;
9776 }
9777}
9778
9779IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9780{
9781 RTUINT128U uSrc1 = *puDst;
9782
9783 if (uShift <= 15)
9784 {
9785 puDst->au16[0] = uSrc1.au16[0] << uShift;
9786 puDst->au16[1] = uSrc1.au16[1] << uShift;
9787 puDst->au16[2] = uSrc1.au16[2] << uShift;
9788 puDst->au16[3] = uSrc1.au16[3] << uShift;
9789 puDst->au16[4] = uSrc1.au16[4] << uShift;
9790 puDst->au16[5] = uSrc1.au16[5] << uShift;
9791 puDst->au16[6] = uSrc1.au16[6] << uShift;
9792 puDst->au16[7] = uSrc1.au16[7] << uShift;
9793 }
9794 else
9795 {
9796 puDst->au64[0] = 0;
9797 puDst->au64[1] = 0;
9798 }
9799}
9800
9801#endif
9802
9803
9804/*
9805 * PSRLD / VPSRLD
9806 */
9807#ifdef IEM_WITHOUT_ASSEMBLY
9808
9809IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9810{
9811 RTUINT64U uSrc1 = { *puDst };
9812 RTUINT64U uSrc2 = { *puSrc };
9813 RTUINT64U uDst;
9814
9815 if (uSrc2.au64[0] <= 31)
9816 {
9817 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
9818 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
9819 }
9820 else
9821 {
9822 uDst.au64[0] = 0;
9823 }
9824 *puDst = uDst.u;
9825}
9826
9827
9828IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9829{
9830 RTUINT64U uSrc1 = { *puDst };
9831 RTUINT64U uDst;
9832
9833 if (uShift <= 31)
9834 {
9835 uDst.au32[0] = uSrc1.au32[0] >> uShift;
9836 uDst.au32[1] = uSrc1.au32[1] >> uShift;
9837 }
9838 else
9839 {
9840 uDst.au64[0] = 0;
9841 }
9842 *puDst = uDst.u;
9843}
9844
9845
9846IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9847{
9848 RTUINT128U uSrc1 = *puDst;
9849
9850 if (puSrc->au64[0] <= 31)
9851 {
9852 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
9853 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
9854 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
9855 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
9856 }
9857 else
9858 {
9859 puDst->au64[0] = 0;
9860 puDst->au64[1] = 0;
9861 }
9862}
9863
9864IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9865{
9866 RTUINT128U uSrc1 = *puDst;
9867
9868 if (uShift <= 31)
9869 {
9870 puDst->au32[0] = uSrc1.au32[0] >> uShift;
9871 puDst->au32[1] = uSrc1.au32[1] >> uShift;
9872 puDst->au32[2] = uSrc1.au32[2] >> uShift;
9873 puDst->au32[3] = uSrc1.au32[3] >> uShift;
9874 }
9875 else
9876 {
9877 puDst->au64[0] = 0;
9878 puDst->au64[1] = 0;
9879 }
9880}
9881
9882#endif
9883
9884
9885/*
9886 * PSRAD / VPSRAD
9887 */
9888#ifdef IEM_WITHOUT_ASSEMBLY
9889
9890IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
9891{
9892 RTUINT64U uSrc1 = { *puDst };
9893 RTUINT64U uSrc2 = { *puSrc };
9894 RTUINT64U uDst;
9895
9896 if (uSrc2.au64[0] <= 31)
9897 {
9898 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
9899 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
9900 }
9901 else
9902 {
9903 uDst.au64[0] = 0;
9904 }
9905 *puDst = uDst.u;
9906}
9907
9908
9909IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
9910{
9911 RTUINT64U uSrc1 = { *puDst };
9912 RTUINT64U uDst;
9913
9914 if (uShift <= 31)
9915 {
9916 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
9917 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
9918 }
9919 else
9920 {
9921 uDst.au64[0] = 0;
9922 }
9923 *puDst = uDst.u;
9924}
9925
9926
9927IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9928{
9929 RTUINT128U uSrc1 = *puDst;
9930
9931 if (puSrc->au64[0] <= 31)
9932 {
9933 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
9934 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
9935 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
9936 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
9937 }
9938 else
9939 {
9940 puDst->au64[0] = 0;
9941 puDst->au64[1] = 0;
9942 }
9943}
9944
9945IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9946{
9947 RTUINT128U uSrc1 = *puDst;
9948
9949 if (uShift <= 31)
9950 {
9951 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
9952 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
9953 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
9954 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
9955 }
9956 else
9957 {
9958 puDst->au64[0] = 0;
9959 puDst->au64[1] = 0;
9960 }
9961}
9962
9963#endif
9964
9965
9966/*
9967 * PSLLD / VPSLLD
9968 */
9969#ifdef IEM_WITHOUT_ASSEMBLY
9970
9971IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9972{
9973 RTUINT64U uSrc1 = { *puDst };
9974 RTUINT64U uSrc2 = { *puSrc };
9975 RTUINT64U uDst;
9976
9977 if (uSrc2.au64[0] <= 31)
9978 {
9979 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
9980 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
9981 }
9982 else
9983 {
9984 uDst.au64[0] = 0;
9985 }
9986 *puDst = uDst.u;
9987}
9988
9989
9990IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9991{
9992 RTUINT64U uSrc1 = { *puDst };
9993 RTUINT64U uDst;
9994
9995 if (uShift <= 31)
9996 {
9997 uDst.au32[0] = uSrc1.au32[0] << uShift;
9998 uDst.au32[1] = uSrc1.au32[1] << uShift;
9999 }
10000 else
10001 {
10002 uDst.au64[0] = 0;
10003 }
10004 *puDst = uDst.u;
10005}
10006
10007
10008IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10009{
10010 RTUINT128U uSrc1 = *puDst;
10011
10012 if (puSrc->au64[0] <= 31)
10013 {
10014 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10015 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10016 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10017 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10018 }
10019 else
10020 {
10021 puDst->au64[0] = 0;
10022 puDst->au64[1] = 0;
10023 }
10024}
10025
10026IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10027{
10028 RTUINT128U uSrc1 = *puDst;
10029
10030 if (uShift <= 31)
10031 {
10032 puDst->au32[0] = uSrc1.au32[0] << uShift;
10033 puDst->au32[1] = uSrc1.au32[1] << uShift;
10034 puDst->au32[2] = uSrc1.au32[2] << uShift;
10035 puDst->au32[3] = uSrc1.au32[3] << uShift;
10036 }
10037 else
10038 {
10039 puDst->au64[0] = 0;
10040 puDst->au64[1] = 0;
10041 }
10042}
10043
10044#endif
10045
10046
10047/*
10048 * PSRLQ / VPSRLQ
10049 */
10050#ifdef IEM_WITHOUT_ASSEMBLY
10051
10052IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10053{
10054 RTUINT64U uSrc1 = { *puDst };
10055 RTUINT64U uSrc2 = { *puSrc };
10056 RTUINT64U uDst;
10057
10058 if (uSrc2.au64[0] <= 63)
10059 {
10060 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10061 }
10062 else
10063 {
10064 uDst.au64[0] = 0;
10065 }
10066 *puDst = uDst.u;
10067}
10068
10069
10070IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10071{
10072 RTUINT64U uSrc1 = { *puDst };
10073 RTUINT64U uDst;
10074
10075 if (uShift <= 63)
10076 {
10077 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10078 }
10079 else
10080 {
10081 uDst.au64[0] = 0;
10082 }
10083 *puDst = uDst.u;
10084}
10085
10086
10087IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10088{
10089 RTUINT128U uSrc1 = *puDst;
10090
10091 if (puSrc->au64[0] <= 63)
10092 {
10093 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10094 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10095 }
10096 else
10097 {
10098 puDst->au64[0] = 0;
10099 puDst->au64[1] = 0;
10100 }
10101}
10102
10103IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10104{
10105 RTUINT128U uSrc1 = *puDst;
10106
10107 if (uShift <= 63)
10108 {
10109 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10110 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10111 }
10112 else
10113 {
10114 puDst->au64[0] = 0;
10115 puDst->au64[1] = 0;
10116 }
10117}
10118
10119#endif
10120
10121
10122/*
10123 * PSLLQ / VPSLLQ
10124 */
10125#ifdef IEM_WITHOUT_ASSEMBLY
10126
10127IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10128{
10129 RTUINT64U uSrc1 = { *puDst };
10130 RTUINT64U uSrc2 = { *puSrc };
10131 RTUINT64U uDst;
10132
10133 if (uSrc2.au64[0] <= 63)
10134 {
10135 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10136 }
10137 else
10138 {
10139 uDst.au64[0] = 0;
10140 }
10141 *puDst = uDst.u;
10142}
10143
10144
10145IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10146{
10147 RTUINT64U uSrc1 = { *puDst };
10148 RTUINT64U uDst;
10149
10150 if (uShift <= 63)
10151 {
10152 uDst.au64[0] = uSrc1.au64[0] << uShift;
10153 }
10154 else
10155 {
10156 uDst.au64[0] = 0;
10157 }
10158 *puDst = uDst.u;
10159}
10160
10161
10162IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10163{
10164 RTUINT128U uSrc1 = *puDst;
10165
10166 if (puSrc->au64[0] <= 63)
10167 {
10168 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10169 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10170 }
10171 else
10172 {
10173 puDst->au64[0] = 0;
10174 puDst->au64[1] = 0;
10175 }
10176}
10177
10178IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10179{
10180 RTUINT128U uSrc1 = *puDst;
10181
10182 if (uShift <= 63)
10183 {
10184 puDst->au64[0] = uSrc1.au64[0] << uShift;
10185 puDst->au64[1] = uSrc1.au64[1] << uShift;
10186 }
10187 else
10188 {
10189 puDst->au64[0] = 0;
10190 puDst->au64[1] = 0;
10191 }
10192}
10193
10194#endif
10195
10196
10197/*
10198 * PSRLDQ / VPSRLDQ
10199 */
10200#ifdef IEM_WITHOUT_ASSEMBLY
10201
10202IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10203{
10204 RTUINT128U uSrc1 = *puDst;
10205
10206 if (uShift < 16)
10207 {
10208 int i;
10209
10210 for (i = 0; i < 16 - uShift; ++i)
10211 puDst->au8[i] = uSrc1.au8[i + uShift];
10212 for (i = 16 - uShift; i < 16; ++i)
10213 puDst->au8[i] = 0;
10214 }
10215 else
10216 {
10217 puDst->au64[0] = 0;
10218 puDst->au64[1] = 0;
10219 }
10220}
10221
10222#endif
10223
10224
10225/*
10226 * PSLLDQ / VPSLLDQ
10227 */
10228#ifdef IEM_WITHOUT_ASSEMBLY
10229
10230IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10231{
10232 RTUINT128U uSrc1 = *puDst;
10233
10234 if (uShift < 16)
10235 {
10236 int i;
10237
10238 for (i = 0; i < uShift; ++i)
10239 puDst->au8[i] = 0;
10240 for (i = uShift; i < 16; ++i)
10241 puDst->au8[i] = uSrc1.au8[i - uShift];
10242 }
10243 else
10244 {
10245 puDst->au64[0] = 0;
10246 puDst->au64[1] = 0;
10247 }
10248}
10249
10250#endif
10251
10252
10253/*
10254 * PMADDWD / VPMADDWD
10255 */
10256#ifdef IEM_WITHOUT_ASSEMBLY
10257
10258IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10259{
10260 RTUINT64U uSrc1 = { *puDst };
10261 RTUINT64U uSrc2 = { *puSrc };
10262 RTUINT64U uDst;
10263
10264 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
10265 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
10266 *puDst = uDst.u;
10267 RT_NOREF(pFpuState);
10268}
10269
10270
10271IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10272{
10273 RTUINT128U uSrc1 = *puDst;
10274
10275 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
10276 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
10277 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
10278 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
10279 RT_NOREF(pFpuState);
10280}
10281
10282#endif
10283
10284
10285/*
10286 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
10287 */
10288#ifdef IEM_WITHOUT_ASSEMBLY
10289
10290IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10291{
10292 RTUINT64U uSrc1 = { *puDst };
10293 RTUINT64U uSrc2 = { *puSrc };
10294 RTUINT64U uDst;
10295
10296 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
10297 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
10298 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
10299 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
10300 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
10301 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
10302 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
10303 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
10304 *puDst = uDst.u;
10305 RT_NOREF(pFpuState);
10306}
10307
10308
10309IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10310{
10311 RTUINT128U uSrc1 = *puDst;
10312
10313 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
10314 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
10315 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
10316 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
10317 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
10318 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
10319 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
10320 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
10321 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
10322 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
10323 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
10324 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
10325 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
10326 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
10327 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
10328 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
10329 RT_NOREF(pFpuState);
10330}
10331
10332#endif
10333
10334
10335IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10336{
10337 RTUINT128U uSrc1 = *puDst;
10338
10339 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
10340 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
10341 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
10342 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
10343 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
10344 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
10345 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
10346 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
10347 RT_NOREF(pFpuState);
10348}
10349
10350
10351IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10352{
10353 RTUINT128U uSrc1 = *puDst;
10354
10355 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
10356 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
10357 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
10358 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
10359 RT_NOREF(pFpuState);
10360}
10361
10362
10363IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10364 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10365{
10366 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10367 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10368 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10369 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10370 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10371 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10372 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10373 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10374 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10375 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10376 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10377 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10378 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10379 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10380 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10381 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10382 RT_NOREF(pExtState);
10383}
10384
10385
10386IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10387 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10388{
10389 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10390 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10391 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10392 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10393 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10394 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10395 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10396 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10397 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10398 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10399 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10400 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10401 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10402 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10403 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10404 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10405 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
10406 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
10407 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
10408 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
10409 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
10410 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
10411 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
10412 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
10413 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
10414 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
10415 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
10416 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
10417 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
10418 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
10419 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
10420 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
10421 RT_NOREF(pExtState);
10422}
10423
10424
10425IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10426 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10427{
10428 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10429 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10430 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10431 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10432 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10433 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10434 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10435 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10436 RT_NOREF(pExtState);
10437}
10438
10439
10440IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10441 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10442{
10443 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10444 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10445 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10446 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10447 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10448 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10449 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10450 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10451 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10452 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10453 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
10454 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
10455 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
10456 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
10457 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
10458 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
10459 RT_NOREF(pExtState);
10460}
10461
10462
10463IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10464 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10465{
10466 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10467 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10468 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10469 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10470 RT_NOREF(pExtState);
10471}
10472
10473
10474IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10475 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10476{
10477 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10478 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10479 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10480 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10481 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10482 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10483 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10484 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10485 RT_NOREF(pExtState);
10486}
10487
10488
/*
 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
 */
10492#ifdef IEM_WITHOUT_ASSEMBLY
10493
10494IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10495{
10496 RTUINT64U uSrc1 = { *puDst };
10497 RTUINT64U uSrc2 = { *puSrc };
10498 RTUINT64U uDst;
10499
10500 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
10501 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
10502 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
10503 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
10504 *puDst = uDst.u;
10505 RT_NOREF(pFpuState);
10506}
10507
10508
10509IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10510{
10511 RTUINT128U uSrc1 = *puDst;
10512
10513 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10514 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10515 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10516 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10517 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10518 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10519 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10520 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10521 RT_NOREF(pFpuState);
10522}
10523
10524#endif
10525
10526IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10527{
10528 RTUINT128U uSrc1 = *puDst;
10529
10530 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10531 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10532 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10533 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10534 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10535 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10536 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10537 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10538 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10539 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10540 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
10541 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
10542 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
10543 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
10544 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
10545 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
10546 RT_NOREF(pFpuState);
10547}
10548
10549
10550IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10551{
10552 RTUINT128U uSrc1 = *puDst;
10553
10554 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10555 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10556 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10557 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10558 RT_NOREF(pFpuState);
10559}
10560
10561
10562IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10563 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10564{
10565 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10566 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10567 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10568 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10569 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10570 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10571 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10572 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10573 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10574 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10575 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10576 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10577 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10578 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10579 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10580 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10581 RT_NOREF(pExtState);
10582}
10583
10584
10585IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10586 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10587{
10588 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10589 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10590 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10591 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10592 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10593 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10594 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10595 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10596 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10597 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10598 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10599 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10600 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10601 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10602 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10603 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10604 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
10605 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
10606 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
10607 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
10608 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
10609 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
10610 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
10611 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
10612 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
10613 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
10614 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
10615 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
10616 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
10617 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
10618 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
10619 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
10620 RT_NOREF(pExtState);
10621}
10622
10623
10624IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10625 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10626{
10627 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10628 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10629 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10630 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10631 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10632 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10633 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10634 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10635 RT_NOREF(pExtState);
10636}
10637
10638
10639IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10640 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10641{
10642 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10643 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10644 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10645 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10646 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10647 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10648 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10649 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10650 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10651 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10652 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
10653 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
10654 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
10655 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
10656 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
10657 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
10658 RT_NOREF(pExtState);
10659}
10660
10661
10662IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10663 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10664{
10665 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10666 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10667 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10668 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10669 RT_NOREF(pExtState);
10670}
10671
10672
10673IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10674 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10675{
10676 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10677 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10678 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10679 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10680 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10681 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10682 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10683 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10684 RT_NOREF(pExtState);
10685}
10686
10687
/*
 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
 */
10691#ifdef IEM_WITHOUT_ASSEMBLY
10692
10693IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10694{
10695 RTUINT64U uSrc1 = { *puDst };
10696 RTUINT64U uSrc2 = { *puSrc };
10697 RTUINT64U uDst;
10698
10699 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
10700 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
10701 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
10702 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
10703 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
10704 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
10705 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
10706 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
10707 *puDst = uDst.u;
10708 RT_NOREF(pFpuState);
10709}
10710
10711
10712IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10713{
10714 RTUINT128U uSrc1 = *puDst;
10715
10716 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
10717 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
10718 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
10719 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
10720 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
10721 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
10722 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
10723 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
10724 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
10725 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
10726 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
10727 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
10728 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
10729 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
10730 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
10731 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
10732 RT_NOREF(pFpuState);
10733}
10734
10735#endif
10736
10737IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10738{
10739 RTUINT128U uSrc1 = *puDst;
10740
10741 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
10742 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
10743 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
10744 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
10745 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
10746 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
10747 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
10748 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
10749 RT_NOREF(pFpuState);
10750}
10751
10752
10753IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10754{
10755 RTUINT128U uSrc1 = *puDst;
10756
10757 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
10758 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
10759 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
10760 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
10761 RT_NOREF(pFpuState);
10762}
10763
10764
10765IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10766 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10767{
10768 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10769 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10770 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10771 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10772 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10773 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10774 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10775 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10776 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10777 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10778 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10779 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10780 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10781 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10782 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10783 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10784 RT_NOREF(pExtState);
10785}
10786
10787
10788IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10789 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10790{
10791 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10792 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10793 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10794 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10795 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10796 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10797 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10798 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10799 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10800 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10801 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10802 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10803 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10804 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10805 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10806 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10807 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
10808 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
10809 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
10810 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
10811 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
10812 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
10813 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
10814 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
10815 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
10816 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
10817 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
10818 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
10819 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
10820 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
10821 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
10822 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
10823 RT_NOREF(pExtState);
10824}
10825
10826
10827IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10828 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10829{
10830 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10831 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10832 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10833 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10834 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10835 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10836 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10837 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10838 RT_NOREF(pExtState);
10839}
10840
10841
10842IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10843 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10844{
10845 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10846 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10847 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10848 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10849 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10850 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10851 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10852 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10853 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10854 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10855 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
10856 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
10857 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
10858 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
10859 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
10860 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
10861 RT_NOREF(pExtState);
10862}
10863
10864
10865IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10866 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10867{
10868 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10869 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10870 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10871 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10872 RT_NOREF(pExtState);
10873}
10874
10875
10876IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10877 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10878{
10879 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10880 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10881 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10882 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10883 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10884 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10885 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10886 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10887 RT_NOREF(pExtState);
10888}
10889
10890
/*
 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
 */
10894#ifdef IEM_WITHOUT_ASSEMBLY
10895
10896IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10897{
10898 RTUINT64U uSrc1 = { *puDst };
10899 RTUINT64U uSrc2 = { *puSrc };
10900 RTUINT64U uDst;
10901
10902 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
10903 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
10904 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
10905 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
10906 *puDst = uDst.u;
10907 RT_NOREF(pFpuState);
10908}
10909
10910
10911IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10912{
10913 RTUINT128U uSrc1 = *puDst;
10914
10915 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10916 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10917 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10918 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10919 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10920 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10921 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10922 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10923 RT_NOREF(pFpuState);
10924}
10925
10926#endif
10927
10928IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10929{
10930 RTUINT128U uSrc1 = *puDst;
10931
10932 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10933 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10934 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10935 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10936 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10937 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10938 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10939 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10940 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10941 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10942 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
10943 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
10944 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
10945 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
10946 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
10947 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
10948 RT_NOREF(pFpuState);
10949}
10950
10951
10952IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10953{
10954 RTUINT128U uSrc1 = *puDst;
10955
10956 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10957 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10958 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10959 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10960 RT_NOREF(pFpuState);
10961}
10962
10963
10964IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10965 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10966{
10967 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10968 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10969 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10970 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10971 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10972 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10973 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10974 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10975 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10976 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10977 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10978 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10979 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10980 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10981 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10982 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10983 RT_NOREF(pExtState);
10984}
10985
10986
10987IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10988 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10989{
10990 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10991 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10992 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10993 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10994 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10995 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10996 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10997 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10998 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10999 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11000 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11001 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11002 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11003 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11004 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11005 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11006 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
11007 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
11008 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
11009 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
11010 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
11011 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
11012 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
11013 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
11014 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
11015 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
11016 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
11017 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
11018 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
11019 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
11020 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
11021 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
11022 RT_NOREF(pExtState);
11023}
11024
11025
11026IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11027 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11028{
11029 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11030 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11031 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11032 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11033 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11034 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11035 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11036 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11037 RT_NOREF(pExtState);
11038}
11039
11040
11041IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11042 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11043{
11044 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11045 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11046 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11047 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11048 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11049 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11050 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11051 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11052 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11053 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11054 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
11055 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
11056 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
11057 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
11058 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
11059 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
11060 RT_NOREF(pExtState);
11061}
11062
11063
11064IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11065 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11066{
11067 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11068 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11069 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11070 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11071 RT_NOREF(pExtState);
11072}
11073
11074
11075IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11076 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11077{
11078 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11079 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11080 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11081 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11082 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11083 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11084 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11085 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11086 RT_NOREF(pExtState);
11087}
11088
11089
11090/*
11091 * PAVGB / VPAVGB / PAVGW / VPAVGW
11092 */
/** Rounded average of two byte elements: (a + b + 1) >> 1.  The first operand is
 *  widened to 16 bits before adding so the sum cannot wrap at the byte boundary;
 *  the result is truncated back to 8 bits. */
#define PAVGB_EXEC(a_Src1, a_Src2) \
    ((uint8_t)((1 + (uint16_t)(a_Src1) + (a_Src2)) >> 1))
/** Rounded average of two word elements: (a + b + 1) >> 1.  The first operand is
 *  widened to 32 bits before adding so the sum cannot wrap at the word boundary;
 *  the result is truncated back to 16 bits. */
#define PAVGW_EXEC(a_Src1, a_Src2) \
    ((uint16_t)((1 + (uint32_t)(a_Src1) + (a_Src2)) >> 1))
11095
11096#ifdef IEM_WITHOUT_ASSEMBLY
11097
11098IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11099{
11100 RTUINT64U uSrc1 = { *puDst };
11101 RTUINT64U uSrc2 = { *puSrc };
11102 RTUINT64U uDst;
11103
11104 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
11105 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
11106 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
11107 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
11108 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
11109 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
11110 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
11111 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
11112 *puDst = uDst.u;
11113}
11114
11115
11116IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11117{
11118 RTUINT128U uSrc1 = *puDst;
11119
11120 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11121 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11122 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11123 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11124 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11125 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11126 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11127 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11128 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11129 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11130 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11131 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11132 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11133 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11134 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11135 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11136}
11137
11138
11139IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11140{
11141 RTUINT64U uSrc1 = { *puDst };
11142 RTUINT64U uSrc2 = { *puSrc };
11143 RTUINT64U uDst;
11144
11145 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
11146 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
11147 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
11148 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
11149 *puDst = uDst.u;
11150}
11151
11152
11153IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11154{
11155 RTUINT128U uSrc1 = *puDst;
11156
11157 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
11158 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
11159 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
11160 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
11161 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
11162 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
11163 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
11164 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
11165}
11166
11167#endif
11168
11169IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11170{
11171 RTUINT128U uSrc1 = *puDst;
11172
11173 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11174 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11175 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11176 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11177 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11178 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11179 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11180 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11181 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11182 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11183 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11184 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11185 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11186 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11187 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11188 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11189}
11190
11191
11192IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11193{
11194 RTUINT128U uSrc1 = *puDst;
11195
11196 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11197 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11198 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11199 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11200 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11201 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11202 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11203 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11204 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11205 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11206 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11207 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11208 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11209 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11210 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11211 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11212}
11213
11214
11215IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11216{
11217 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11218 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11219 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11220 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11221 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11222 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11223 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11224 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11225 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11226 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11227 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11228 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11229 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11230 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11231 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11232 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11233}
11234
11235
11236IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11237{
11238 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11239 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11240 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11241 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11242 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11243 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11244 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11245 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11246 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11247 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11248 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11249 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11250 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11251 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11252 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11253 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11254 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
11255 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
11256 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
11257 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
11258 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
11259 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
11260 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
11261 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
11262 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
11263 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
11264 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
11265 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
11266 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
11267 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
11268 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
11269 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
11270}
11271
11272
11273IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11274{
11275 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11276 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11277 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11278 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11279 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11280 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11281 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11282 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11283}
11284
11285
11286IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11287{
11288 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11289 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11290 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11291 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11292 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11293 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11294 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11295 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11296 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11297 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11298 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
11299 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
11300 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
11301 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
11302 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
11303 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
11304}
11305
11306#undef PAVGB_EXEC
11307#undef PAVGW_EXEC
11308
11309
11310/*
11311 * PMOVMSKB / VPMOVMSKB
11312 */
11313#ifdef IEM_WITHOUT_ASSEMBLY
11314
11315IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
11316{
11317 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11318 uint64_t const uSrc = *pu64Src;
11319 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
11320 | ((uSrc >> (15-1)) & RT_BIT_64(1))
11321 | ((uSrc >> (23-2)) & RT_BIT_64(2))
11322 | ((uSrc >> (31-3)) & RT_BIT_64(3))
11323 | ((uSrc >> (39-4)) & RT_BIT_64(4))
11324 | ((uSrc >> (47-5)) & RT_BIT_64(5))
11325 | ((uSrc >> (55-6)) & RT_BIT_64(6))
11326 | ((uSrc >> (63-7)) & RT_BIT_64(7));
11327}
11328
11329
11330IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
11331{
11332 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11333 uint64_t const uSrc0 = pu128Src->QWords.qw0;
11334 uint64_t const uSrc1 = pu128Src->QWords.qw1;
11335 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11336 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11337 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11338 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11339 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11340 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11341 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11342 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11343 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11344 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11345 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11346 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11347 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11348 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11349 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11350 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
11351}
11352
11353#endif
11354
11355IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
11356{
11357 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11358 uint64_t const uSrc0 = puSrc->QWords.qw0;
11359 uint64_t const uSrc1 = puSrc->QWords.qw1;
11360 uint64_t const uSrc2 = puSrc->QWords.qw2;
11361 uint64_t const uSrc3 = puSrc->QWords.qw3;
11362 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11363 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11364 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11365 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11366 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11367 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11368 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11369 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11370 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11371 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11372 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11373 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11374 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11375 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11376 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11377 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
11378 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
11379 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
11380 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
11381 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
11382 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
11383 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
11384 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
11385 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
11386 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
11387 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
11388 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
11389 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
11390 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
11391 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
11392 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
11393 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
11394}
11395
11396
11397/*
11398 * [V]PSHUFB
11399 */
11400
11401IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11402{
11403 RTUINT64U const uSrc = { *puSrc };
11404 RTUINT64U const uDstIn = { *puDst };
11405 ASMCompilerBarrier();
11406 RTUINT64U uDstOut = { 0 };
11407 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
11408 {
11409 uint8_t idxSrc = uSrc.au8[iByte];
11410 if (!(idxSrc & 0x80))
11411 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
11412 }
11413 *puDst = uDstOut.u;
11414 RT_NOREF(pFpuState);
11415}
11416
11417
11418IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11419{
11420 RTUINT128U const uSrc = *puSrc;
11421 RTUINT128U const uDstIn = *puDst;
11422 ASMCompilerBarrier();
11423 puDst->au64[0] = 0;
11424 puDst->au64[1] = 0;
11425 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11426 {
11427 uint8_t idxSrc = uSrc.au8[iByte];
11428 if (!(idxSrc & 0x80))
11429 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
11430 }
11431 RT_NOREF(pFpuState);
11432}
11433
11434
11435IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11436 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11437{
11438 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
11439 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
11440 ASMCompilerBarrier();
11441 puDst->au64[0] = 0;
11442 puDst->au64[1] = 0;
11443 for (unsigned iByte = 0; iByte < 16; iByte++)
11444 {
11445 uint8_t idxSrc = uSrc2.au8[iByte];
11446 if (!(idxSrc & 0x80))
11447 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11448 }
11449 RT_NOREF(pExtState);
11450}
11451
11452
11453IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11454 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11455{
11456 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
11457 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
11458 ASMCompilerBarrier();
11459 puDst->au64[0] = 0;
11460 puDst->au64[1] = 0;
11461 puDst->au64[2] = 0;
11462 puDst->au64[3] = 0;
11463 for (unsigned iByte = 0; iByte < 16; iByte++)
11464 {
11465 uint8_t idxSrc = uSrc2.au8[iByte];
11466 if (!(idxSrc & 0x80))
11467 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11468 }
11469 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11470 {
11471 uint8_t idxSrc = uSrc2.au8[iByte];
11472 if (!(idxSrc & 0x80))
11473 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
11474 }
11475 RT_NOREF(pExtState);
11476}
11477
11478
11479/*
11480 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
11481 */
11482#ifdef IEM_WITHOUT_ASSEMBLY
11483
11484IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
11485{
11486 uint64_t const uSrc = *puSrc;
11487 ASMCompilerBarrier();
11488 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11489 uSrc >> (((bEvil >> 2) & 3) * 16),
11490 uSrc >> (((bEvil >> 4) & 3) * 16),
11491 uSrc >> (((bEvil >> 6) & 3) * 16));
11492}
11493
11494
11495IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11496{
11497 puDst->QWords.qw0 = puSrc->QWords.qw0;
11498 uint64_t const uSrc = puSrc->QWords.qw1;
11499 ASMCompilerBarrier();
11500 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11501 uSrc >> (((bEvil >> 2) & 3) * 16),
11502 uSrc >> (((bEvil >> 4) & 3) * 16),
11503 uSrc >> (((bEvil >> 6) & 3) * 16));
11504}
11505
11506#endif
11507
11508IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11509{
11510 puDst->QWords.qw0 = puSrc->QWords.qw0;
11511 uint64_t const uSrc1 = puSrc->QWords.qw1;
11512 puDst->QWords.qw2 = puSrc->QWords.qw2;
11513 uint64_t const uSrc3 = puSrc->QWords.qw3;
11514 ASMCompilerBarrier();
11515 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
11516 uSrc1 >> (((bEvil >> 2) & 3) * 16),
11517 uSrc1 >> (((bEvil >> 4) & 3) * 16),
11518 uSrc1 >> (((bEvil >> 6) & 3) * 16));
11519 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
11520 uSrc3 >> (((bEvil >> 2) & 3) * 16),
11521 uSrc3 >> (((bEvil >> 4) & 3) * 16),
11522 uSrc3 >> (((bEvil >> 6) & 3) * 16));
11523}
11524
11525#ifdef IEM_WITHOUT_ASSEMBLY
11526IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11527{
11528 puDst->QWords.qw1 = puSrc->QWords.qw1;
11529 uint64_t const uSrc = puSrc->QWords.qw0;
11530 ASMCompilerBarrier();
11531 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11532 uSrc >> (((bEvil >> 2) & 3) * 16),
11533 uSrc >> (((bEvil >> 4) & 3) * 16),
11534 uSrc >> (((bEvil >> 6) & 3) * 16));
11535
11536}
11537#endif
11538
11539
11540IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11541{
11542 puDst->QWords.qw3 = puSrc->QWords.qw3;
11543 uint64_t const uSrc2 = puSrc->QWords.qw2;
11544 puDst->QWords.qw1 = puSrc->QWords.qw1;
11545 uint64_t const uSrc0 = puSrc->QWords.qw0;
11546 ASMCompilerBarrier();
11547 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
11548 uSrc0 >> (((bEvil >> 2) & 3) * 16),
11549 uSrc0 >> (((bEvil >> 4) & 3) * 16),
11550 uSrc0 >> (((bEvil >> 6) & 3) * 16));
11551 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
11552 uSrc2 >> (((bEvil >> 2) & 3) * 16),
11553 uSrc2 >> (((bEvil >> 4) & 3) * 16),
11554 uSrc2 >> (((bEvil >> 6) & 3) * 16));
11555
11556}
11557
11558
11559#ifdef IEM_WITHOUT_ASSEMBLY
11560IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11561{
11562 RTUINT128U const uSrc = *puSrc;
11563 ASMCompilerBarrier();
11564 puDst->au32[0] = uSrc.au32[bEvil & 3];
11565 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
11566 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
11567 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
11568}
11569#endif
11570
11571
11572IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11573{
11574 RTUINT256U const uSrc = *puSrc;
11575 ASMCompilerBarrier();
11576 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
11577 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
11578 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
11579 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
11580 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
11581 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
11582 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
11583 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
11584}
11585
11586
11587/*
11588 * PUNPCKHBW - high bytes -> words
11589 */
11590#ifdef IEM_WITHOUT_ASSEMBLY
11591
11592IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11593{
11594 RTUINT64U const uSrc2 = { *puSrc };
11595 RTUINT64U const uSrc1 = { *puDst };
11596 ASMCompilerBarrier();
11597 RTUINT64U uDstOut;
11598 uDstOut.au8[0] = uSrc1.au8[4];
11599 uDstOut.au8[1] = uSrc2.au8[4];
11600 uDstOut.au8[2] = uSrc1.au8[5];
11601 uDstOut.au8[3] = uSrc2.au8[5];
11602 uDstOut.au8[4] = uSrc1.au8[6];
11603 uDstOut.au8[5] = uSrc2.au8[6];
11604 uDstOut.au8[6] = uSrc1.au8[7];
11605 uDstOut.au8[7] = uSrc2.au8[7];
11606 *puDst = uDstOut.u;
11607}
11608
11609
11610IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11611{
11612 RTUINT128U const uSrc2 = *puSrc;
11613 RTUINT128U const uSrc1 = *puDst;
11614 ASMCompilerBarrier();
11615 RTUINT128U uDstOut;
11616 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11617 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11618 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11619 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11620 uDstOut.au8[ 4] = uSrc1.au8[10];
11621 uDstOut.au8[ 5] = uSrc2.au8[10];
11622 uDstOut.au8[ 6] = uSrc1.au8[11];
11623 uDstOut.au8[ 7] = uSrc2.au8[11];
11624 uDstOut.au8[ 8] = uSrc1.au8[12];
11625 uDstOut.au8[ 9] = uSrc2.au8[12];
11626 uDstOut.au8[10] = uSrc1.au8[13];
11627 uDstOut.au8[11] = uSrc2.au8[13];
11628 uDstOut.au8[12] = uSrc1.au8[14];
11629 uDstOut.au8[13] = uSrc2.au8[14];
11630 uDstOut.au8[14] = uSrc1.au8[15];
11631 uDstOut.au8[15] = uSrc2.au8[15];
11632 *puDst = uDstOut;
11633}
11634
11635#endif
11636
11637IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11638{
11639 RTUINT128U const uSrc2 = *puSrc2;
11640 RTUINT128U const uSrc1 = *puSrc1;
11641 ASMCompilerBarrier();
11642 RTUINT128U uDstOut;
11643 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11644 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11645 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11646 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11647 uDstOut.au8[ 4] = uSrc1.au8[10];
11648 uDstOut.au8[ 5] = uSrc2.au8[10];
11649 uDstOut.au8[ 6] = uSrc1.au8[11];
11650 uDstOut.au8[ 7] = uSrc2.au8[11];
11651 uDstOut.au8[ 8] = uSrc1.au8[12];
11652 uDstOut.au8[ 9] = uSrc2.au8[12];
11653 uDstOut.au8[10] = uSrc1.au8[13];
11654 uDstOut.au8[11] = uSrc2.au8[13];
11655 uDstOut.au8[12] = uSrc1.au8[14];
11656 uDstOut.au8[13] = uSrc2.au8[14];
11657 uDstOut.au8[14] = uSrc1.au8[15];
11658 uDstOut.au8[15] = uSrc2.au8[15];
11659 *puDst = uDstOut;
11660}
11661
11662
11663IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11664{
11665 RTUINT256U const uSrc2 = *puSrc2;
11666 RTUINT256U const uSrc1 = *puSrc1;
11667 ASMCompilerBarrier();
11668 RTUINT256U uDstOut;
11669 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11670 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11671 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11672 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11673 uDstOut.au8[ 4] = uSrc1.au8[10];
11674 uDstOut.au8[ 5] = uSrc2.au8[10];
11675 uDstOut.au8[ 6] = uSrc1.au8[11];
11676 uDstOut.au8[ 7] = uSrc2.au8[11];
11677 uDstOut.au8[ 8] = uSrc1.au8[12];
11678 uDstOut.au8[ 9] = uSrc2.au8[12];
11679 uDstOut.au8[10] = uSrc1.au8[13];
11680 uDstOut.au8[11] = uSrc2.au8[13];
11681 uDstOut.au8[12] = uSrc1.au8[14];
11682 uDstOut.au8[13] = uSrc2.au8[14];
11683 uDstOut.au8[14] = uSrc1.au8[15];
11684 uDstOut.au8[15] = uSrc2.au8[15];
11685 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11686 uDstOut.au8[16] = uSrc1.au8[24];
11687 uDstOut.au8[17] = uSrc2.au8[24];
11688 uDstOut.au8[18] = uSrc1.au8[25];
11689 uDstOut.au8[19] = uSrc2.au8[25];
11690 uDstOut.au8[20] = uSrc1.au8[26];
11691 uDstOut.au8[21] = uSrc2.au8[26];
11692 uDstOut.au8[22] = uSrc1.au8[27];
11693 uDstOut.au8[23] = uSrc2.au8[27];
11694 uDstOut.au8[24] = uSrc1.au8[28];
11695 uDstOut.au8[25] = uSrc2.au8[28];
11696 uDstOut.au8[26] = uSrc1.au8[29];
11697 uDstOut.au8[27] = uSrc2.au8[29];
11698 uDstOut.au8[28] = uSrc1.au8[30];
11699 uDstOut.au8[29] = uSrc2.au8[30];
11700 uDstOut.au8[30] = uSrc1.au8[31];
11701 uDstOut.au8[31] = uSrc2.au8[31];
11702 *puDst = uDstOut;
11703}
11704
11705
11706/*
11707 * PUNPCKHWD - high words -> dwords
11708 */
11709#ifdef IEM_WITHOUT_ASSEMBLY
11710
11711IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11712{
11713 RTUINT64U const uSrc2 = { *puSrc };
11714 RTUINT64U const uSrc1 = { *puDst };
11715 ASMCompilerBarrier();
11716 RTUINT64U uDstOut;
11717 uDstOut.au16[0] = uSrc1.au16[2];
11718 uDstOut.au16[1] = uSrc2.au16[2];
11719 uDstOut.au16[2] = uSrc1.au16[3];
11720 uDstOut.au16[3] = uSrc2.au16[3];
11721 *puDst = uDstOut.u;
11722}
11723
11724
11725IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11726{
11727 RTUINT128U const uSrc2 = *puSrc;
11728 RTUINT128U const uSrc1 = *puDst;
11729 ASMCompilerBarrier();
11730 RTUINT128U uDstOut;
11731 uDstOut.au16[0] = uSrc1.au16[4];
11732 uDstOut.au16[1] = uSrc2.au16[4];
11733 uDstOut.au16[2] = uSrc1.au16[5];
11734 uDstOut.au16[3] = uSrc2.au16[5];
11735 uDstOut.au16[4] = uSrc1.au16[6];
11736 uDstOut.au16[5] = uSrc2.au16[6];
11737 uDstOut.au16[6] = uSrc1.au16[7];
11738 uDstOut.au16[7] = uSrc2.au16[7];
11739 *puDst = uDstOut;
11740}
11741
11742#endif
11743
11744IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11745{
11746 RTUINT128U const uSrc2 = *puSrc2;
11747 RTUINT128U const uSrc1 = *puSrc1;
11748 ASMCompilerBarrier();
11749 RTUINT128U uDstOut;
11750 uDstOut.au16[0] = uSrc1.au16[4];
11751 uDstOut.au16[1] = uSrc2.au16[4];
11752 uDstOut.au16[2] = uSrc1.au16[5];
11753 uDstOut.au16[3] = uSrc2.au16[5];
11754 uDstOut.au16[4] = uSrc1.au16[6];
11755 uDstOut.au16[5] = uSrc2.au16[6];
11756 uDstOut.au16[6] = uSrc1.au16[7];
11757 uDstOut.au16[7] = uSrc2.au16[7];
11758 *puDst = uDstOut;
11759}
11760
11761
11762IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11763{
11764 RTUINT256U const uSrc2 = *puSrc2;
11765 RTUINT256U const uSrc1 = *puSrc1;
11766 ASMCompilerBarrier();
11767 RTUINT256U uDstOut;
11768 uDstOut.au16[0] = uSrc1.au16[4];
11769 uDstOut.au16[1] = uSrc2.au16[4];
11770 uDstOut.au16[2] = uSrc1.au16[5];
11771 uDstOut.au16[3] = uSrc2.au16[5];
11772 uDstOut.au16[4] = uSrc1.au16[6];
11773 uDstOut.au16[5] = uSrc2.au16[6];
11774 uDstOut.au16[6] = uSrc1.au16[7];
11775 uDstOut.au16[7] = uSrc2.au16[7];
11776
11777 uDstOut.au16[8] = uSrc1.au16[12];
11778 uDstOut.au16[9] = uSrc2.au16[12];
11779 uDstOut.au16[10] = uSrc1.au16[13];
11780 uDstOut.au16[11] = uSrc2.au16[13];
11781 uDstOut.au16[12] = uSrc1.au16[14];
11782 uDstOut.au16[13] = uSrc2.au16[14];
11783 uDstOut.au16[14] = uSrc1.au16[15];
11784 uDstOut.au16[15] = uSrc2.au16[15];
11785 *puDst = uDstOut;
11786}
11787
11788
11789/*
11790 * PUNPCKHBW - high dwords -> qword(s)
11791 */
11792#ifdef IEM_WITHOUT_ASSEMBLY
11793
11794IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11795{
11796 RTUINT64U const uSrc2 = { *puSrc };
11797 RTUINT64U const uSrc1 = { *puDst };
11798 ASMCompilerBarrier();
11799 RTUINT64U uDstOut;
11800 uDstOut.au32[0] = uSrc1.au32[1];
11801 uDstOut.au32[1] = uSrc2.au32[1];
11802 *puDst = uDstOut.u;
11803}
11804
11805
11806IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11807{
11808 RTUINT128U const uSrc2 = *puSrc;
11809 RTUINT128U const uSrc1 = *puDst;
11810 ASMCompilerBarrier();
11811 RTUINT128U uDstOut;
11812 uDstOut.au32[0] = uSrc1.au32[2];
11813 uDstOut.au32[1] = uSrc2.au32[2];
11814 uDstOut.au32[2] = uSrc1.au32[3];
11815 uDstOut.au32[3] = uSrc2.au32[3];
11816 *puDst = uDstOut;
11817}
11818
11819#endif
11820
11821IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11822{
11823 RTUINT128U const uSrc2 = *puSrc2;
11824 RTUINT128U const uSrc1 = *puSrc1;
11825 ASMCompilerBarrier();
11826 RTUINT128U uDstOut;
11827 uDstOut.au32[0] = uSrc1.au32[2];
11828 uDstOut.au32[1] = uSrc2.au32[2];
11829 uDstOut.au32[2] = uSrc1.au32[3];
11830 uDstOut.au32[3] = uSrc2.au32[3];
11831 *puDst = uDstOut;
11832}
11833
11834
11835IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11836{
11837 RTUINT256U const uSrc2 = *puSrc2;
11838 RTUINT256U const uSrc1 = *puSrc1;
11839 ASMCompilerBarrier();
11840 RTUINT256U uDstOut;
11841 uDstOut.au32[0] = uSrc1.au32[2];
11842 uDstOut.au32[1] = uSrc2.au32[2];
11843 uDstOut.au32[2] = uSrc1.au32[3];
11844 uDstOut.au32[3] = uSrc2.au32[3];
11845
11846 uDstOut.au32[4] = uSrc1.au32[6];
11847 uDstOut.au32[5] = uSrc2.au32[6];
11848 uDstOut.au32[6] = uSrc1.au32[7];
11849 uDstOut.au32[7] = uSrc2.au32[7];
11850 *puDst = uDstOut;
11851}
11852
11853
11854/*
11855 * PUNPCKHQDQ -> High qwords -> double qword(s).
11856 */
11857#ifdef IEM_WITHOUT_ASSEMBLY
11858IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11859{
11860 RTUINT128U const uSrc2 = *puSrc;
11861 RTUINT128U const uSrc1 = *puDst;
11862 ASMCompilerBarrier();
11863 RTUINT128U uDstOut;
11864 uDstOut.au64[0] = uSrc1.au64[1];
11865 uDstOut.au64[1] = uSrc2.au64[1];
11866 *puDst = uDstOut;
11867}
11868#endif
11869
11870
11871IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11872{
11873 RTUINT128U const uSrc2 = *puSrc2;
11874 RTUINT128U const uSrc1 = *puSrc1;
11875 ASMCompilerBarrier();
11876 RTUINT128U uDstOut;
11877 uDstOut.au64[0] = uSrc1.au64[1];
11878 uDstOut.au64[1] = uSrc2.au64[1];
11879 *puDst = uDstOut;
11880}
11881
11882
11883IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11884{
11885 RTUINT256U const uSrc2 = *puSrc2;
11886 RTUINT256U const uSrc1 = *puSrc1;
11887 ASMCompilerBarrier();
11888 RTUINT256U uDstOut;
11889 uDstOut.au64[0] = uSrc1.au64[1];
11890 uDstOut.au64[1] = uSrc2.au64[1];
11891
11892 uDstOut.au64[2] = uSrc1.au64[3];
11893 uDstOut.au64[3] = uSrc2.au64[3];
11894 *puDst = uDstOut;
11895}
11896
11897
11898/*
11899 * PUNPCKLBW - low bytes -> words
11900 */
11901#ifdef IEM_WITHOUT_ASSEMBLY
11902
11903IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11904{
11905 RTUINT64U const uSrc2 = { *puSrc };
11906 RTUINT64U const uSrc1 = { *puDst };
11907 ASMCompilerBarrier();
11908 RTUINT64U uDstOut;
11909 uDstOut.au8[0] = uSrc1.au8[0];
11910 uDstOut.au8[1] = uSrc2.au8[0];
11911 uDstOut.au8[2] = uSrc1.au8[1];
11912 uDstOut.au8[3] = uSrc2.au8[1];
11913 uDstOut.au8[4] = uSrc1.au8[2];
11914 uDstOut.au8[5] = uSrc2.au8[2];
11915 uDstOut.au8[6] = uSrc1.au8[3];
11916 uDstOut.au8[7] = uSrc2.au8[3];
11917 *puDst = uDstOut.u;
11918}
11919
11920
11921IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11922{
11923 RTUINT128U const uSrc2 = *puSrc;
11924 RTUINT128U const uSrc1 = *puDst;
11925 ASMCompilerBarrier();
11926 RTUINT128U uDstOut;
11927 uDstOut.au8[ 0] = uSrc1.au8[0];
11928 uDstOut.au8[ 1] = uSrc2.au8[0];
11929 uDstOut.au8[ 2] = uSrc1.au8[1];
11930 uDstOut.au8[ 3] = uSrc2.au8[1];
11931 uDstOut.au8[ 4] = uSrc1.au8[2];
11932 uDstOut.au8[ 5] = uSrc2.au8[2];
11933 uDstOut.au8[ 6] = uSrc1.au8[3];
11934 uDstOut.au8[ 7] = uSrc2.au8[3];
11935 uDstOut.au8[ 8] = uSrc1.au8[4];
11936 uDstOut.au8[ 9] = uSrc2.au8[4];
11937 uDstOut.au8[10] = uSrc1.au8[5];
11938 uDstOut.au8[11] = uSrc2.au8[5];
11939 uDstOut.au8[12] = uSrc1.au8[6];
11940 uDstOut.au8[13] = uSrc2.au8[6];
11941 uDstOut.au8[14] = uSrc1.au8[7];
11942 uDstOut.au8[15] = uSrc2.au8[7];
11943 *puDst = uDstOut;
11944}
11945
11946#endif
11947
11948IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11949{
11950 RTUINT128U const uSrc2 = *puSrc2;
11951 RTUINT128U const uSrc1 = *puSrc1;
11952 ASMCompilerBarrier();
11953 RTUINT128U uDstOut;
11954 uDstOut.au8[ 0] = uSrc1.au8[0];
11955 uDstOut.au8[ 1] = uSrc2.au8[0];
11956 uDstOut.au8[ 2] = uSrc1.au8[1];
11957 uDstOut.au8[ 3] = uSrc2.au8[1];
11958 uDstOut.au8[ 4] = uSrc1.au8[2];
11959 uDstOut.au8[ 5] = uSrc2.au8[2];
11960 uDstOut.au8[ 6] = uSrc1.au8[3];
11961 uDstOut.au8[ 7] = uSrc2.au8[3];
11962 uDstOut.au8[ 8] = uSrc1.au8[4];
11963 uDstOut.au8[ 9] = uSrc2.au8[4];
11964 uDstOut.au8[10] = uSrc1.au8[5];
11965 uDstOut.au8[11] = uSrc2.au8[5];
11966 uDstOut.au8[12] = uSrc1.au8[6];
11967 uDstOut.au8[13] = uSrc2.au8[6];
11968 uDstOut.au8[14] = uSrc1.au8[7];
11969 uDstOut.au8[15] = uSrc2.au8[7];
11970 *puDst = uDstOut;
11971}
11972
11973
11974IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11975{
11976 RTUINT256U const uSrc2 = *puSrc2;
11977 RTUINT256U const uSrc1 = *puSrc1;
11978 ASMCompilerBarrier();
11979 RTUINT256U uDstOut;
11980 uDstOut.au8[ 0] = uSrc1.au8[0];
11981 uDstOut.au8[ 1] = uSrc2.au8[0];
11982 uDstOut.au8[ 2] = uSrc1.au8[1];
11983 uDstOut.au8[ 3] = uSrc2.au8[1];
11984 uDstOut.au8[ 4] = uSrc1.au8[2];
11985 uDstOut.au8[ 5] = uSrc2.au8[2];
11986 uDstOut.au8[ 6] = uSrc1.au8[3];
11987 uDstOut.au8[ 7] = uSrc2.au8[3];
11988 uDstOut.au8[ 8] = uSrc1.au8[4];
11989 uDstOut.au8[ 9] = uSrc2.au8[4];
11990 uDstOut.au8[10] = uSrc1.au8[5];
11991 uDstOut.au8[11] = uSrc2.au8[5];
11992 uDstOut.au8[12] = uSrc1.au8[6];
11993 uDstOut.au8[13] = uSrc2.au8[6];
11994 uDstOut.au8[14] = uSrc1.au8[7];
11995 uDstOut.au8[15] = uSrc2.au8[7];
11996 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11997 uDstOut.au8[16] = uSrc1.au8[16];
11998 uDstOut.au8[17] = uSrc2.au8[16];
11999 uDstOut.au8[18] = uSrc1.au8[17];
12000 uDstOut.au8[19] = uSrc2.au8[17];
12001 uDstOut.au8[20] = uSrc1.au8[18];
12002 uDstOut.au8[21] = uSrc2.au8[18];
12003 uDstOut.au8[22] = uSrc1.au8[19];
12004 uDstOut.au8[23] = uSrc2.au8[19];
12005 uDstOut.au8[24] = uSrc1.au8[20];
12006 uDstOut.au8[25] = uSrc2.au8[20];
12007 uDstOut.au8[26] = uSrc1.au8[21];
12008 uDstOut.au8[27] = uSrc2.au8[21];
12009 uDstOut.au8[28] = uSrc1.au8[22];
12010 uDstOut.au8[29] = uSrc2.au8[22];
12011 uDstOut.au8[30] = uSrc1.au8[23];
12012 uDstOut.au8[31] = uSrc2.au8[23];
12013 *puDst = uDstOut;
12014}
12015
12016
12017/*
 * PUNPCKLWD - low words -> dwords
12019 */
12020#ifdef IEM_WITHOUT_ASSEMBLY
12021
12022IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12023{
12024 RTUINT64U const uSrc2 = { *puSrc };
12025 RTUINT64U const uSrc1 = { *puDst };
12026 ASMCompilerBarrier();
12027 RTUINT64U uDstOut;
12028 uDstOut.au16[0] = uSrc1.au16[0];
12029 uDstOut.au16[1] = uSrc2.au16[0];
12030 uDstOut.au16[2] = uSrc1.au16[1];
12031 uDstOut.au16[3] = uSrc2.au16[1];
12032 *puDst = uDstOut.u;
12033}
12034
12035
12036IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12037{
12038 RTUINT128U const uSrc2 = *puSrc;
12039 RTUINT128U const uSrc1 = *puDst;
12040 ASMCompilerBarrier();
12041 RTUINT128U uDstOut;
12042 uDstOut.au16[0] = uSrc1.au16[0];
12043 uDstOut.au16[1] = uSrc2.au16[0];
12044 uDstOut.au16[2] = uSrc1.au16[1];
12045 uDstOut.au16[3] = uSrc2.au16[1];
12046 uDstOut.au16[4] = uSrc1.au16[2];
12047 uDstOut.au16[5] = uSrc2.au16[2];
12048 uDstOut.au16[6] = uSrc1.au16[3];
12049 uDstOut.au16[7] = uSrc2.au16[3];
12050 *puDst = uDstOut;
12051}
12052
12053#endif
12054
12055IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12056{
12057 RTUINT128U const uSrc2 = *puSrc2;
12058 RTUINT128U const uSrc1 = *puSrc1;
12059 ASMCompilerBarrier();
12060 RTUINT128U uDstOut;
12061 uDstOut.au16[0] = uSrc1.au16[0];
12062 uDstOut.au16[1] = uSrc2.au16[0];
12063 uDstOut.au16[2] = uSrc1.au16[1];
12064 uDstOut.au16[3] = uSrc2.au16[1];
12065 uDstOut.au16[4] = uSrc1.au16[2];
12066 uDstOut.au16[5] = uSrc2.au16[2];
12067 uDstOut.au16[6] = uSrc1.au16[3];
12068 uDstOut.au16[7] = uSrc2.au16[3];
12069 *puDst = uDstOut;
12070}
12071
12072
12073IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12074{
12075 RTUINT256U const uSrc2 = *puSrc2;
12076 RTUINT256U const uSrc1 = *puSrc1;
12077 ASMCompilerBarrier();
12078 RTUINT256U uDstOut;
12079 uDstOut.au16[0] = uSrc1.au16[0];
12080 uDstOut.au16[1] = uSrc2.au16[0];
12081 uDstOut.au16[2] = uSrc1.au16[1];
12082 uDstOut.au16[3] = uSrc2.au16[1];
12083 uDstOut.au16[4] = uSrc1.au16[2];
12084 uDstOut.au16[5] = uSrc2.au16[2];
12085 uDstOut.au16[6] = uSrc1.au16[3];
12086 uDstOut.au16[7] = uSrc2.au16[3];
12087
12088 uDstOut.au16[8] = uSrc1.au16[8];
12089 uDstOut.au16[9] = uSrc2.au16[8];
12090 uDstOut.au16[10] = uSrc1.au16[9];
12091 uDstOut.au16[11] = uSrc2.au16[9];
12092 uDstOut.au16[12] = uSrc1.au16[10];
12093 uDstOut.au16[13] = uSrc2.au16[10];
12094 uDstOut.au16[14] = uSrc1.au16[11];
12095 uDstOut.au16[15] = uSrc2.au16[11];
12096 *puDst = uDstOut;
12097}
12098
12099
12100/*
 * PUNPCKLDQ - low dwords -> qword(s)
12102 */
12103#ifdef IEM_WITHOUT_ASSEMBLY
12104
12105IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12106{
12107 RTUINT64U const uSrc2 = { *puSrc };
12108 RTUINT64U const uSrc1 = { *puDst };
12109 ASMCompilerBarrier();
12110 RTUINT64U uDstOut;
12111 uDstOut.au32[0] = uSrc1.au32[0];
12112 uDstOut.au32[1] = uSrc2.au32[0];
12113 *puDst = uDstOut.u;
12114}
12115
12116
12117IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12118{
12119 RTUINT128U const uSrc2 = *puSrc;
12120 RTUINT128U const uSrc1 = *puDst;
12121 ASMCompilerBarrier();
12122 RTUINT128U uDstOut;
12123 uDstOut.au32[0] = uSrc1.au32[0];
12124 uDstOut.au32[1] = uSrc2.au32[0];
12125 uDstOut.au32[2] = uSrc1.au32[1];
12126 uDstOut.au32[3] = uSrc2.au32[1];
12127 *puDst = uDstOut;
12128}
12129
12130#endif
12131
12132IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12133{
12134 RTUINT128U const uSrc2 = *puSrc2;
12135 RTUINT128U const uSrc1 = *puSrc1;
12136 ASMCompilerBarrier();
12137 RTUINT128U uDstOut;
12138 uDstOut.au32[0] = uSrc1.au32[0];
12139 uDstOut.au32[1] = uSrc2.au32[0];
12140 uDstOut.au32[2] = uSrc1.au32[1];
12141 uDstOut.au32[3] = uSrc2.au32[1];
12142 *puDst = uDstOut;
12143}
12144
12145
12146IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12147{
12148 RTUINT256U const uSrc2 = *puSrc2;
12149 RTUINT256U const uSrc1 = *puSrc1;
12150 ASMCompilerBarrier();
12151 RTUINT256U uDstOut;
12152 uDstOut.au32[0] = uSrc1.au32[0];
12153 uDstOut.au32[1] = uSrc2.au32[0];
12154 uDstOut.au32[2] = uSrc1.au32[1];
12155 uDstOut.au32[3] = uSrc2.au32[1];
12156
12157 uDstOut.au32[4] = uSrc1.au32[4];
12158 uDstOut.au32[5] = uSrc2.au32[4];
12159 uDstOut.au32[6] = uSrc1.au32[5];
12160 uDstOut.au32[7] = uSrc2.au32[5];
12161 *puDst = uDstOut;
12162}
12163
12164
12165/*
12166 * PUNPCKLQDQ -> Low qwords -> double qword(s).
12167 */
12168#ifdef IEM_WITHOUT_ASSEMBLY
12169IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12170{
12171 RTUINT128U const uSrc2 = *puSrc;
12172 RTUINT128U const uSrc1 = *puDst;
12173 ASMCompilerBarrier();
12174 RTUINT128U uDstOut;
12175 uDstOut.au64[0] = uSrc1.au64[0];
12176 uDstOut.au64[1] = uSrc2.au64[0];
12177 *puDst = uDstOut;
12178}
12179#endif
12180
12181
12182IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12183{
12184 RTUINT128U const uSrc2 = *puSrc2;
12185 RTUINT128U const uSrc1 = *puSrc1;
12186 ASMCompilerBarrier();
12187 RTUINT128U uDstOut;
12188 uDstOut.au64[0] = uSrc1.au64[0];
12189 uDstOut.au64[1] = uSrc2.au64[0];
12190 *puDst = uDstOut;
12191}
12192
12193
12194IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12195{
12196 RTUINT256U const uSrc2 = *puSrc2;
12197 RTUINT256U const uSrc1 = *puSrc1;
12198 ASMCompilerBarrier();
12199 RTUINT256U uDstOut;
12200 uDstOut.au64[0] = uSrc1.au64[0];
12201 uDstOut.au64[1] = uSrc2.au64[0];
12202
12203 uDstOut.au64[2] = uSrc1.au64[2];
12204 uDstOut.au64[3] = uSrc2.au64[2];
12205 *puDst = uDstOut;
12206}
12207
12208
12209/*
12210 * PACKSSWB - signed words -> signed bytes
12211 */
12212
12213#ifdef IEM_WITHOUT_ASSEMBLY
12214
12215IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12216{
12217 RTUINT64U const uSrc2 = { *puSrc };
12218 RTUINT64U const uSrc1 = { *puDst };
12219 ASMCompilerBarrier();
12220 RTUINT64U uDstOut;
12221 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12222 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12223 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12224 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12225 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12226 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12227 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12228 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12229 *puDst = uDstOut.u;
12230}
12231
12232
12233IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12234{
12235 RTUINT128U const uSrc2 = *puSrc;
12236 RTUINT128U const uSrc1 = *puDst;
12237 ASMCompilerBarrier();
12238 RTUINT128U uDstOut;
12239 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12240 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12241 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12242 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12243 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12244 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12245 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12246 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12247 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12248 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12249 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12250 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12251 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12252 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12253 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12254 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12255 *puDst = uDstOut;
12256}
12257
12258#endif
12259
12260IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12261{
12262 RTUINT128U const uSrc2 = *puSrc2;
12263 RTUINT128U const uSrc1 = *puSrc1;
12264 ASMCompilerBarrier();
12265 RTUINT128U uDstOut;
12266 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12267 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12268 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12269 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12270 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12271 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12272 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12273 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12274 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12275 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12276 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12277 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12278 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12279 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12280 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12281 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12282 *puDst = uDstOut;
12283}
12284
12285
12286IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12287{
12288 RTUINT256U const uSrc2 = *puSrc2;
12289 RTUINT256U const uSrc1 = *puSrc1;
12290 ASMCompilerBarrier();
12291 RTUINT256U uDstOut;
12292 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12293 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12294 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12295 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12296 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12297 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12298 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12299 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12300 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12301 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12302 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12303 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12304 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12305 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12306 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12307 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12308
12309 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
12310 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
12311 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
12312 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
12313 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
12314 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
12315 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
12316 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
12317 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
12318 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
12319 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
12320 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
12321 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
12322 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
12323 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
12324 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
12325 *puDst = uDstOut;
12326}
12327
12328
12329/*
12330 * PACKUSWB - signed words -> unsigned bytes
12331 */
/* Values with no bits set above bit 7 pass through unchanged.  Otherwise bit 15
   (the sign of the source word) selects the limit: set -> 0x00, clear -> 0xff.
   Note that the argument is evaluated more than once. */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    ( ((uint16_t)(a_iWord) >> 8) == 0 \
      ? (uint8_t)(a_iWord) \
      : (((a_iWord) >> 15) & 1) != 0 ? (uint8_t)0x00 : (uint8_t)0xff )
12336
12337#ifdef IEM_WITHOUT_ASSEMBLY
12338
12339IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12340{
12341 RTUINT64U const uSrc2 = { *puSrc };
12342 RTUINT64U const uSrc1 = { *puDst };
12343 ASMCompilerBarrier();
12344 RTUINT64U uDstOut;
12345 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12346 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12347 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12348 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12349 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12350 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12351 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12352 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12353 *puDst = uDstOut.u;
12354}
12355
12356
12357IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12358{
12359 RTUINT128U const uSrc2 = *puSrc;
12360 RTUINT128U const uSrc1 = *puDst;
12361 ASMCompilerBarrier();
12362 RTUINT128U uDstOut;
12363 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12364 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12365 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12366 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12367 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12368 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12369 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12370 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12371 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12372 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12373 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12374 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12375 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12376 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12377 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12378 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12379 *puDst = uDstOut;
12380}
12381
12382#endif
12383
12384IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12385{
12386 RTUINT128U const uSrc2 = *puSrc2;
12387 RTUINT128U const uSrc1 = *puSrc1;
12388 ASMCompilerBarrier();
12389 RTUINT128U uDstOut;
12390 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12391 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12392 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12393 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12394 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12395 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12396 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12397 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12398 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12399 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12400 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12401 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12402 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12403 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12404 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12405 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12406 *puDst = uDstOut;
12407}
12408
12409
12410IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12411{
12412 RTUINT256U const uSrc2 = *puSrc2;
12413 RTUINT256U const uSrc1 = *puSrc1;
12414 ASMCompilerBarrier();
12415 RTUINT256U uDstOut;
12416 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12417 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12418 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12419 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12420 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12421 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12422 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12423 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12424 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12425 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12426 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12427 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12428 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12429 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12430 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12431 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12432
12433 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
12434 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
12435 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
12436 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
12437 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
12438 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
12439 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
12440 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
12441 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
12442 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
12443 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
12444 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
12445 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
12446 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
12447 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
12448 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
12449 *puDst = uDstOut;
12450}
12451
12452
12453/*
12454 * PACKSSDW - signed dwords -> signed words
12455 */
12456
12457#ifdef IEM_WITHOUT_ASSEMBLY
12458
12459IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12460{
12461 RTUINT64U const uSrc2 = { *puSrc };
12462 RTUINT64U const uSrc1 = { *puDst };
12463 ASMCompilerBarrier();
12464 RTUINT64U uDstOut;
12465 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12466 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12467 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12468 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12469 *puDst = uDstOut.u;
12470}
12471
12472
12473IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12474{
12475 RTUINT128U const uSrc2 = *puSrc;
12476 RTUINT128U const uSrc1 = *puDst;
12477 ASMCompilerBarrier();
12478 RTUINT128U uDstOut;
12479 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12480 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12481 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12482 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12483 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12484 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12485 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12486 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12487 *puDst = uDstOut;
12488}
12489
12490#endif
12491
12492IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12493{
12494 RTUINT128U const uSrc2 = *puSrc2;
12495 RTUINT128U const uSrc1 = *puSrc1;
12496 ASMCompilerBarrier();
12497 RTUINT128U uDstOut;
12498 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12499 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12500 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12501 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12502 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12503 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12504 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12505 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12506 *puDst = uDstOut;
12507}
12508
12509
12510IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12511{
12512 RTUINT256U const uSrc2 = *puSrc2;
12513 RTUINT256U const uSrc1 = *puSrc1;
12514 ASMCompilerBarrier();
12515 RTUINT256U uDstOut;
12516 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12517 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12518 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12519 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12520 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12521 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12522 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12523 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12524
12525 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
12526 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
12527 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
12528 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
12529 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
12530 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
12531 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
12532 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
12533 *puDst = uDstOut;
12534}
12535
12536
12537/*
12538 * PACKUSDW - signed dwords -> unsigned words
12539 */
/* Values with no bits set above bit 15 pass through unchanged.  Otherwise bit 31
   (the sign of the source dword) selects the limit: set -> 0x0000, clear -> 0xffff.
   Note that the argument is evaluated more than once. */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    ( ((uint32_t)(a_iDword) >> 16) == 0 \
      ? (uint16_t)(a_iDword) \
      : (((a_iDword) >> 31) & 1) != 0 ? (uint16_t)0x0000 : (uint16_t)0xffff )
12544
12545#ifdef IEM_WITHOUT_ASSEMBLY
12546IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12547{
12548 RTUINT128U const uSrc2 = *puSrc;
12549 RTUINT128U const uSrc1 = *puDst;
12550 ASMCompilerBarrier();
12551 RTUINT128U uDstOut;
12552 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12553 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12554 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12555 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12556 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12557 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12558 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12559 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12560 *puDst = uDstOut;
12561}
12562#endif
12563
12564IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12565{
12566 RTUINT128U const uSrc2 = *puSrc2;
12567 RTUINT128U const uSrc1 = *puSrc1;
12568 ASMCompilerBarrier();
12569 RTUINT128U uDstOut;
12570 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12571 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12572 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12573 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12574 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12575 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12576 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12577 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12578 *puDst = uDstOut;
12579}
12580
12581
12582IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12583{
12584 RTUINT256U const uSrc2 = *puSrc2;
12585 RTUINT256U const uSrc1 = *puSrc1;
12586 ASMCompilerBarrier();
12587 RTUINT256U uDstOut;
12588 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12589 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12590 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12591 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12592 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12593 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12594 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12595 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12596
12597 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
12598 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
12599 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
12600 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
12601 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
12602 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
12603 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
12604 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
12605 *puDst = uDstOut;
12606}
12607
12608
12609/*
12610 * [V]PABSB / [V]PABSW / [V]PABSD
12611 */
12612
12613IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12614{
12615 RTUINT64U const uSrc = { *puSrc };
12616 RTUINT64U uDstOut = { 0 };
12617
12618 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
12619 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
12620 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
12621 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
12622 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
12623 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
12624 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
12625 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
12626 *puDst = uDstOut.u;
12627 RT_NOREF(pFpuState);
12628}
12629
12630
12631IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12632{
12633 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12634 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12635 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12636 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12637 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12638 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12639 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12640 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12641 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12642 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12643 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12644 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12645 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12646 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12647 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12648 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12649 RT_NOREF(pFpuState);
12650}
12651
12652
12653IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12654{
12655 RTUINT64U const uSrc = { *puSrc };
12656 RTUINT64U uDstOut = { 0 };
12657
12658 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
12659 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
12660 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
12661 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
12662 *puDst = uDstOut.u;
12663 RT_NOREF(pFpuState);
12664}
12665
12666
12667IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12668{
12669 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12670 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12671 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12672 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12673 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12674 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12675 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12676 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12677 RT_NOREF(pFpuState);
12678}
12679
12680
12681IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12682{
12683 RTUINT64U const uSrc = { *puSrc };
12684 RTUINT64U uDstOut = { 0 };
12685
12686 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
12687 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
12688 *puDst = uDstOut.u;
12689 RT_NOREF(pFpuState);
12690}
12691
12692
12693IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12694{
12695 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12696 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12697 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12698 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12699 RT_NOREF(pFpuState);
12700}
12701
12702
12703IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12704{
12705 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12706 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12707 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12708 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12709 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12710 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12711 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12712 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12713 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12714 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12715 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12716 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12717 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12718 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12719 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12720 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12721}
12722
12723
12724IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12725{
12726 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12727 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12728 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12729 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12730 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12731 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12732 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12733 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12734 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12735 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12736 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12737 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12738 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12739 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12740 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12741 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12742 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
12743 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
12744 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
12745 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
12746 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
12747 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
12748 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
12749 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
12750 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
12751 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
12752 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
12753 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
12754 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
12755 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
12756 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
12757 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
12758}
12759
12760
12761IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12762{
12763 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12764 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12765 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12766 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12767 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12768 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12769 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12770 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12771}
12772
12773
12774IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12775{
12776 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12777 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12778 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12779 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12780 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12781 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12782 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12783 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12784 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
12785 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
12786 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
12787 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
12788 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
12789 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
12790 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
12791 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
12792}
12793
12794
12795IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12796{
12797 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12798 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12799 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12800 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12801}
12802
12803
12804IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12805{
12806 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12807 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12808 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12809 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12810 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
12811 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
12812 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
12813 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
12814}
12815
12816
12817/*
12818 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
12819 */
12820IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12821{
12822 RTUINT64U uSrc1 = { *puDst };
12823 RTUINT64U uSrc2 = { *puSrc };
12824 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12825
12826 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
12827 {
12828 if (uSrc2.ai8[i] < 0)
12829 uDst.ai8[i] = -uSrc1.ai8[i];
12830 else if (uSrc2.ai8[i] == 0)
12831 uDst.ai8[i] = 0;
12832 else /* uSrc2.ai8[i] > 0 */
12833 uDst.ai8[i] = uSrc1.ai8[i];
12834 }
12835
12836 *puDst = uDst.u;
12837 RT_NOREF(pFpuState);
12838}
12839
12840
12841IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12842{
12843 RTUINT128U uSrc1 = *puDst;
12844
12845 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12846 {
12847 if (puSrc->ai8[i] < 0)
12848 puDst->ai8[i] = -uSrc1.ai8[i];
12849 else if (puSrc->ai8[i] == 0)
12850 puDst->ai8[i] = 0;
12851 else /* puSrc->ai8[i] > 0 */
12852 puDst->ai8[i] = uSrc1.ai8[i];
12853 }
12854
12855 RT_NOREF(pFpuState);
12856}
12857
12858
12859IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12860{
12861 RTUINT64U uSrc1 = { *puDst };
12862 RTUINT64U uSrc2 = { *puSrc };
12863 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12864
12865 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
12866 {
12867 if (uSrc2.ai16[i] < 0)
12868 uDst.ai16[i] = -uSrc1.ai16[i];
12869 else if (uSrc2.ai16[i] == 0)
12870 uDst.ai16[i] = 0;
12871 else /* uSrc2.ai16[i] > 0 */
12872 uDst.ai16[i] = uSrc1.ai16[i];
12873 }
12874
12875 *puDst = uDst.u;
12876 RT_NOREF(pFpuState);
12877}
12878
12879
12880IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12881{
12882 RTUINT128U uSrc1 = *puDst;
12883
12884 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12885 {
12886 if (puSrc->ai16[i] < 0)
12887 puDst->ai16[i] = -uSrc1.ai16[i];
12888 else if (puSrc->ai16[i] == 0)
12889 puDst->ai16[i] = 0;
12890 else /* puSrc->ai16[i] > 0 */
12891 puDst->ai16[i] = uSrc1.ai16[i];
12892 }
12893
12894 RT_NOREF(pFpuState);
12895}
12896
12897
12898IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12899{
12900 RTUINT64U uSrc1 = { *puDst };
12901 RTUINT64U uSrc2 = { *puSrc };
12902 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12903
12904 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
12905 {
12906 if (uSrc2.ai32[i] < 0)
12907 uDst.ai32[i] = -uSrc1.ai32[i];
12908 else if (uSrc2.ai32[i] == 0)
12909 uDst.ai32[i] = 0;
12910 else /* uSrc2.ai32[i] > 0 */
12911 uDst.ai32[i] = uSrc1.ai32[i];
12912 }
12913
12914 *puDst = uDst.u;
12915 RT_NOREF(pFpuState);
12916}
12917
12918
12919IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12920{
12921 RTUINT128U uSrc1 = *puDst;
12922
12923 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12924 {
12925 if (puSrc->ai32[i] < 0)
12926 puDst->ai32[i] = -uSrc1.ai32[i];
12927 else if (puSrc->ai32[i] == 0)
12928 puDst->ai32[i] = 0;
12929 else /* puSrc->ai32[i] > 0 */
12930 puDst->ai32[i] = uSrc1.ai32[i];
12931 }
12932
12933 RT_NOREF(pFpuState);
12934}
12935
12936
12937IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12938{
12939 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12940 {
12941 if (puSrc2->ai8[i] < 0)
12942 puDst->ai8[i] = -puSrc1->ai8[i];
12943 else if (puSrc2->ai8[i] == 0)
12944 puDst->ai8[i] = 0;
12945 else /* puSrc2->ai8[i] > 0 */
12946 puDst->ai8[i] = puSrc1->ai8[i];
12947 }
12948}
12949
12950
12951IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12952{
12953 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12954 {
12955 if (puSrc2->ai8[i] < 0)
12956 puDst->ai8[i] = -puSrc1->ai8[i];
12957 else if (puSrc2->ai8[i] == 0)
12958 puDst->ai8[i] = 0;
12959 else /* puSrc2->ai8[i] > 0 */
12960 puDst->ai8[i] = puSrc1->ai8[i];
12961 }
12962}
12963
12964
12965IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12966{
12967 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12968 {
12969 if (puSrc2->ai16[i] < 0)
12970 puDst->ai16[i] = -puSrc1->ai16[i];
12971 else if (puSrc2->ai16[i] == 0)
12972 puDst->ai16[i] = 0;
12973 else /* puSrc2->ai16[i] > 0 */
12974 puDst->ai16[i] = puSrc1->ai16[i];
12975 }
12976}
12977
12978
12979IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12980{
12981 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12982 {
12983 if (puSrc2->ai16[i] < 0)
12984 puDst->ai16[i] = -puSrc1->ai16[i];
12985 else if (puSrc2->ai16[i] == 0)
12986 puDst->ai16[i] = 0;
12987 else /* puSrc2->ai16[i] > 0 */
12988 puDst->ai16[i] = puSrc1->ai16[i];
12989 }
12990}
12991
12992
12993IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12994{
12995 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12996 {
12997 if (puSrc2->ai32[i] < 0)
12998 puDst->ai32[i] = -puSrc1->ai32[i];
12999 else if (puSrc2->ai32[i] == 0)
13000 puDst->ai32[i] = 0;
13001 else /* puSrc2->ai32[i] > 0 */
13002 puDst->ai32[i] = puSrc1->ai32[i];
13003 }
13004}
13005
13006
13007IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13008{
13009 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13010 {
13011 if (puSrc2->ai32[i] < 0)
13012 puDst->ai32[i] = -puSrc1->ai32[i];
13013 else if (puSrc2->ai32[i] == 0)
13014 puDst->ai32[i] = 0;
13015 else /* puSrc2->ai32[i] > 0 */
13016 puDst->ai32[i] = puSrc1->ai32[i];
13017 }
13018}
13019
13020
13021/*
13022 * PHADDW / VPHADDW / PHADDD / VPHADDD
13023 */
13024IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13025{
13026 RTUINT64U uSrc1 = { *puDst };
13027 RTUINT64U uSrc2 = { *puSrc };
13028 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13029
13030 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13031 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13032 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
13033 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
13034 *puDst = uDst.u;
13035 RT_NOREF(pFpuState);
13036}
13037
13038
13039IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13040{
13041 RTUINT128U uSrc1 = *puDst;
13042
13043 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13044 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13045 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
13046 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
13047
13048 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
13049 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
13050 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
13051 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
13052 RT_NOREF(pFpuState);
13053}
13054
13055
13056IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13057{
13058 RTUINT64U uSrc1 = { *puDst };
13059 RTUINT64U uSrc2 = { *puSrc };
13060 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13061
13062 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13063 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
13064 *puDst = uDst.u;
13065 RT_NOREF(pFpuState);
13066}
13067
13068
13069IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13070{
13071 RTUINT128U uSrc1 = *puDst;
13072
13073 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13074 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
13075
13076 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
13077 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
13078 RT_NOREF(pFpuState);
13079}
13080
13081
13082IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13083{
13084 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13085
13086 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
13087 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
13088 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
13089 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
13090
13091 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
13092 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
13093 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
13094 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
13095
13096 puDst->au64[0] = uDst.au64[0];
13097 puDst->au64[1] = uDst.au64[1];
13098}
13099
13100
13101IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13102{
13103 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13104
13105 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
13106 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
13107 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
13108 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
13109 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
13110 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
13111 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
13112 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
13113
13114 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
13115 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
13116 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
13117 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
13118 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
13119 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
13120 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
13121 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
13122
13123 puDst->au64[0] = uDst.au64[0];
13124 puDst->au64[1] = uDst.au64[1];
13125 puDst->au64[2] = uDst.au64[2];
13126 puDst->au64[3] = uDst.au64[3];
13127}
13128
13129
13130IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13131{
13132 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13133
13134 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
13135 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
13136
13137 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
13138 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
13139
13140 puDst->au64[0] = uDst.au64[0];
13141 puDst->au64[1] = uDst.au64[1];
13142}
13143
13144
13145IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13146{
13147 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13148
13149 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
13150 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
13151 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
13152 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
13153
13154 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
13155 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
13156 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
13157 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
13158
13159 puDst->au64[0] = uDst.au64[0];
13160 puDst->au64[1] = uDst.au64[1];
13161 puDst->au64[2] = uDst.au64[2];
13162 puDst->au64[3] = uDst.au64[3];
13163}
13164
13165
13166/*
13167 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
13168 */
13169IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13170{
13171 RTUINT64U uSrc1 = { *puDst };
13172 RTUINT64U uSrc2 = { *puSrc };
13173 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13174
13175 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13176 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13177 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
13178 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
13179 *puDst = uDst.u;
13180 RT_NOREF(pFpuState);
13181}
13182
13183
13184IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13185{
13186 RTUINT128U uSrc1 = *puDst;
13187
13188 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13189 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13190 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
13191 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
13192
13193 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
13194 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
13195 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
13196 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
13197 RT_NOREF(pFpuState);
13198}
13199
13200
13201IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13202{
13203 RTUINT64U uSrc1 = { *puDst };
13204 RTUINT64U uSrc2 = { *puSrc };
13205 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13206
13207 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13208 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
13209 *puDst = uDst.u;
13210 RT_NOREF(pFpuState);
13211}
13212
13213
13214IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13215{
13216 RTUINT128U uSrc1 = *puDst;
13217
13218 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13219 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
13220
13221 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
13222 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
13223 RT_NOREF(pFpuState);
13224}
13225
13226
13227IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13228{
13229 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13230
13231 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
13232 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
13233 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
13234 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
13235
13236 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
13237 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
13238 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
13239 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
13240
13241 puDst->au64[0] = uDst.au64[0];
13242 puDst->au64[1] = uDst.au64[1];
13243}
13244
13245
13246IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13247{
13248 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13249
13250 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
13251 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
13252 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
13253 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
13254 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
13255 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
13256 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
13257 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
13258
13259 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
13260 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
13261 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
13262 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
13263 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
13264 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
13265 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
13266 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
13267
13268 puDst->au64[0] = uDst.au64[0];
13269 puDst->au64[1] = uDst.au64[1];
13270 puDst->au64[2] = uDst.au64[2];
13271 puDst->au64[3] = uDst.au64[3];
13272}
13273
13274
13275IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13276{
13277 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13278
13279 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
13280 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
13281
13282 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
13283 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
13284
13285 puDst->au64[0] = uDst.au64[0];
13286 puDst->au64[1] = uDst.au64[1];
13287}
13288
13289
13290IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13291{
13292 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13293
13294 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
13295 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
13296 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
13297 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
13298
13299 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
13300 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
13301 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
13302 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
13303
13304 puDst->au64[0] = uDst.au64[0];
13305 puDst->au64[1] = uDst.au64[1];
13306 puDst->au64[2] = uDst.au64[2];
13307 puDst->au64[3] = uDst.au64[3];
13308}
13309
13310
13311/*
13312 * PHADDSW / VPHADDSW
13313 */
13314IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13315{
13316 RTUINT64U uSrc1 = { *puDst };
13317 RTUINT64U uSrc2 = { *puSrc };
13318 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13319
13320 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13321 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13322 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
13323 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
13324 *puDst = uDst.u;
13325 RT_NOREF(pFpuState);
13326}
13327
13328
13329IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13330{
13331 RTUINT128U uSrc1 = *puDst;
13332
13333 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13334 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13335 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
13336 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
13337
13338 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
13339 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
13340 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
13341 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
13342 RT_NOREF(pFpuState);
13343}
13344
13345
13346IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13347{
13348 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13349
13350 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
13351 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
13352 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
13353 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
13354
13355 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
13356 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
13357 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
13358 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
13359
13360 puDst->au64[0] = uDst.au64[0];
13361 puDst->au64[1] = uDst.au64[1];
13362}
13363
13364
13365IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13366{
13367 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13368
13369 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
13370 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
13371 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
13372 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
13373 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
13374 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
13375 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
13376 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
13377
13378 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
13379 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
13380 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
13381 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
13382 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
13383 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
13384 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
13385 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
13386
13387 puDst->au64[0] = uDst.au64[0];
13388 puDst->au64[1] = uDst.au64[1];
13389 puDst->au64[2] = uDst.au64[2];
13390 puDst->au64[3] = uDst.au64[3];
13391}
13392
13393
13394/*
13395 * PHSUBSW / VPHSUBSW
13396 */
13397IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13398{
13399 RTUINT64U uSrc1 = { *puDst };
13400 RTUINT64U uSrc2 = { *puSrc };
13401 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13402
13403 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13404 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13405 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
13406 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
13407 *puDst = uDst.u;
13408 RT_NOREF(pFpuState);
13409}
13410
13411
13412IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13413{
13414 RTUINT128U uSrc1 = *puDst;
13415
13416 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13417 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13418 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
13419 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
13420
13421 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
13422 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
13423 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
13424 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
13425 RT_NOREF(pFpuState);
13426}
13427
13428
13429IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13430{
13431 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13432
13433 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
13434 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
13435 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
13436 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
13437
13438 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
13439 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
13440 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
13441 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
13442
13443 puDst->au64[0] = uDst.au64[0];
13444 puDst->au64[1] = uDst.au64[1];
13445}
13446
13447
13448IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13449{
13450 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13451
13452 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
13453 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
13454 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
13455 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
13456 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
13457 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
13458 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
13459 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
13460
13461 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
13462 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
13463 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
13464 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
13465 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
13466 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
13467 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
13468 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
13469
13470 puDst->au64[0] = uDst.au64[0];
13471 puDst->au64[1] = uDst.au64[1];
13472 puDst->au64[2] = uDst.au64[2];
13473 puDst->au64[3] = uDst.au64[3];
13474}
13475
13476
13477/*
13478 * PMADDUBSW / VPMADDUBSW
13479 */
13480IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13481{
13482 RTUINT64U uSrc1 = { *puDst };
13483 RTUINT64U uSrc2 = { *puSrc };
13484 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13485
13486 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
13487 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
13488 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
13489 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
13490 *puDst = uDst.u;
13491 RT_NOREF(pFpuState);
13492}
13493
13494
13495IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13496{
13497 RTUINT128U uSrc1 = *puDst;
13498
13499 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
13500 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
13501 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
13502 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
13503 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
13504 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
13505 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
13506 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
13507 RT_NOREF(pFpuState);
13508}
13509
13510
13511IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13512{
13513 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13514
13515 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13516 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13517 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13518 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13519 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13520 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13521 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13522 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13523
13524 puDst->au64[0] = uDst.au64[0];
13525 puDst->au64[1] = uDst.au64[1];
13526}
13527
13528
13529IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13530{
13531 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13532
13533 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13534 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13535 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13536 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13537 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13538 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13539 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13540 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13541 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
13542 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
13543 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
13544 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
13545 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
13546 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
13547 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
13548 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
13549
13550 puDst->au64[0] = uDst.au64[0];
13551 puDst->au64[1] = uDst.au64[1];
13552 puDst->au64[2] = uDst.au64[2];
13553 puDst->au64[3] = uDst.au64[3];
13554}
13555
13556
13557/*
13558 * PMULHRSW / VPMULHRSW
13559 */
/**
 * Computes one PMULHRSW result word.
 *
 * The operands are multiplied as 32-bit signed integers, the product is
 * shifted right by 14, incremented by one (rounding) and shifted right once
 * more; the low 16 bits of that value are the result.
 *
 * The whole expansion is parenthesized so the macro behaves as a single
 * primary expression wherever it is used.
 *
 * @param   a_Src1  The first factor (signed 16-bit value expected).
 * @param   a_Src2  The second factor (signed 16-bit value expected).
 */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ( (uint16_t)( ( ( ((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1 ) >> 1 ) )
13562
13563IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13564{
13565 RTUINT64U uSrc1 = { *puDst };
13566 RTUINT64U uSrc2 = { *puSrc };
13567 RTUINT64U uDst;
13568
13569 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
13570 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
13571 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
13572 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
13573 *puDst = uDst.u;
13574 RT_NOREF(pFpuState);
13575}
13576
13577
13578IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13579{
13580 RTUINT128U uSrc1 = *puDst;
13581
13582 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
13583 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
13584 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
13585 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
13586 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
13587 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
13588 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
13589 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
13590 RT_NOREF(pFpuState);
13591}
13592
13593
13594IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13595{
13596 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13597
13598 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
13599 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
13600 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
13601 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
13602 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
13603 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
13604 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
13605 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
13606
13607 puDst->au64[0] = uDst.au64[0];
13608 puDst->au64[1] = uDst.au64[1];
13609}
13610
13611
13612IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13613{
13614 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13615
13616 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
13617 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
13618 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
13619 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
13620 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
13621 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
13622 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
13623 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
13624 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
13625 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
13626 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
13627 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
13628 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
13629 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
13630 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
13631 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
13632
13633 puDst->au64[0] = uDst.au64[0];
13634 puDst->au64[1] = uDst.au64[1];
13635 puDst->au64[2] = uDst.au64[2];
13636 puDst->au64[3] = uDst.au64[3];
13637}
13638
13639
13640/*
13641 * PSADBW / VPSADBW
13642 */
13643#ifdef IEM_WITHOUT_ASSEMBLY
13644
13645IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13646{
13647 RTUINT64U uSrc1 = { *puDst };
13648 RTUINT64U uSrc2 = { *puSrc };
13649 RTUINT64U uDst;
13650 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13651 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13652 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13653 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13654 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13655 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13656 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13657 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13658
13659 uDst.au64[0] = 0;
13660 uDst.au16[0] = uSum;
13661 *puDst = uDst.u;
13662}
13663
13664
13665IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13666{
13667 RTUINT128U uSrc1 = *puDst;
13668
13669 puDst->au64[0] = 0;
13670 puDst->au64[1] = 0;
13671
13672 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
13673 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
13674 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
13675 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
13676 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
13677 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
13678 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
13679 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
13680 puDst->au16[0] = uSum;
13681
13682 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
13683 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
13684 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
13685 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
13686 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
13687 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
13688 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
13689 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
13690 puDst->au16[4] = uSum;
13691}
13692
13693#endif
13694
13695IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13696{
13697 RTUINT128U uSrc1 = *puSrc1;
13698 RTUINT128U uSrc2 = *puSrc2;
13699
13700 puDst->au64[0] = 0;
13701 puDst->au64[1] = 0;
13702
13703 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
13704 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13705 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13706 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13707 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13708 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13709 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13710 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13711 puDst->au16[0] = uSum;
13712
13713 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13714 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13715 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13716 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13717 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13718 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13719 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13720 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13721 puDst->au16[4] = uSum;
13722}
13723
13724IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13725{
13726 RTUINT256U uSrc1 = *puSrc1;
13727 RTUINT256U uSrc2 = *puSrc2;
13728
13729 puDst->au64[0] = 0;
13730 puDst->au64[1] = 0;
13731 puDst->au64[2] = 0;
13732 puDst->au64[3] = 0;
13733
13734 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13735 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13736 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13737 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13738 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13739 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13740 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13741 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13742 puDst->au16[0] = uSum;
13743
13744 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13745 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13746 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13747 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13748 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13749 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13750 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13751 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13752 puDst->au16[4] = uSum;
13753
13754 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
13755 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
13756 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
13757 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
13758 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
13759 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
13760 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
13761 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
13762 puDst->au16[8] = uSum;
13763
13764 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
13765 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
13766 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
13767 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
13768 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
13769 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
13770 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
13771 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
13772 puDst->au16[12] = uSum;
13773}
13774
13775
13776/*
13777 * PMULDQ / VPMULDQ
13778 */
13779IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13780{
13781 RTUINT128U uSrc1 = *puDst;
13782
13783 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
13784 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
13785}
13786
13787IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13788{
13789 RTUINT128U uSrc1 = *puSrc1;
13790 RTUINT128U uSrc2 = *puSrc2;
13791
13792 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13793 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13794}
13795
13796IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13797{
13798 RTUINT256U uSrc1 = *puSrc1;
13799 RTUINT256U uSrc2 = *puSrc2;
13800
13801 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13802 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13803 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
13804 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
13805}
13806
13807
13808/*
13809 * PMULUDQ / VPMULUDQ
13810 */
13811#ifdef IEM_WITHOUT_ASSEMBLY
13812
13813IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13814{
13815 RTUINT64U uSrc1 = { *puDst };
13816 RTUINT64U uSrc2 = { *puSrc };
13817 ASMCompilerBarrier();
13818 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13819 RT_NOREF(pFpuState);
13820}
13821
13822
13823IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13824{
13825 RTUINT128U uSrc1 = *puDst;
13826 RTUINT128U uSrc2 = *puSrc;
13827 ASMCompilerBarrier();
13828 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13829 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13830 RT_NOREF(pFpuState);
13831}
13832
13833#endif
13834
13835IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13836{
13837 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13838 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13839 ASMCompilerBarrier();
13840 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13841 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13842}
13843
13844
13845IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13846{
13847 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13848 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13849 ASMCompilerBarrier();
13850 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13851 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13852 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
13853 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
13854}
13855
13856
13857/*
13858 * UNPCKLPS / VUNPCKLPS
13859 */
13860#ifdef IEM_WITHOUT_ASSEMBLY
13861IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13862{
13863 RTUINT128U uSrc1 = *puDst;
13864 RTUINT128U uSrc2 = *puSrc;
13865 ASMCompilerBarrier();
13866 puDst->au32[0] = uSrc1.au32[0];
13867 puDst->au32[1] = uSrc2.au32[0];
13868 puDst->au32[2] = uSrc1.au32[1];
13869 puDst->au32[3] = uSrc2.au32[1];
13870}
13871
13872#endif
13873
13874IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13875{
13876 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13877 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13878 ASMCompilerBarrier();
13879 puDst->au32[0] = uSrc1.au32[0];
13880 puDst->au32[1] = uSrc2.au32[0];
13881 puDst->au32[2] = uSrc1.au32[1];
13882 puDst->au32[3] = uSrc2.au32[1];
13883}
13884
13885
13886IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13887{
13888 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13889 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13890 ASMCompilerBarrier();
13891 puDst->au32[0] = uSrc1.au32[0];
13892 puDst->au32[1] = uSrc2.au32[0];
13893 puDst->au32[2] = uSrc1.au32[1];
13894 puDst->au32[3] = uSrc2.au32[1];
13895
13896 puDst->au32[4] = uSrc1.au32[4];
13897 puDst->au32[5] = uSrc2.au32[4];
13898 puDst->au32[6] = uSrc1.au32[5];
13899 puDst->au32[7] = uSrc2.au32[5];
13900}
13901
13902
13903/*
13904 * UNPCKLPD / VUNPCKLPD
13905 */
13906#ifdef IEM_WITHOUT_ASSEMBLY
13907IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13908{
13909 RTUINT128U uSrc1 = *puDst;
13910 RTUINT128U uSrc2 = *puSrc;
13911 ASMCompilerBarrier();
13912 puDst->au64[0] = uSrc1.au64[0];
13913 puDst->au64[1] = uSrc2.au64[0];
13914}
13915
13916#endif
13917
13918IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13919{
13920 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13921 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13922 ASMCompilerBarrier();
13923 puDst->au64[0] = uSrc1.au64[0];
13924 puDst->au64[1] = uSrc2.au64[0];
13925}
13926
13927
13928IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13929{
13930 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13931 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13932 ASMCompilerBarrier();
13933 puDst->au64[0] = uSrc1.au64[0];
13934 puDst->au64[1] = uSrc2.au64[0];
13935 puDst->au64[2] = uSrc1.au64[2];
13936 puDst->au64[3] = uSrc2.au64[2];
13937}
13938
13939
13940/*
13941 * UNPCKHPS / VUNPCKHPS
13942 */
13943#ifdef IEM_WITHOUT_ASSEMBLY
13944IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13945{
13946 RTUINT128U uSrc1 = *puDst;
13947 RTUINT128U uSrc2 = *puSrc;
13948 ASMCompilerBarrier();
13949 puDst->au32[0] = uSrc1.au32[2];
13950 puDst->au32[1] = uSrc2.au32[2];
13951 puDst->au32[2] = uSrc1.au32[3];
13952 puDst->au32[3] = uSrc2.au32[3];
13953}
13954
13955#endif
13956
13957IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13958{
13959 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13960 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13961 ASMCompilerBarrier();
13962 puDst->au32[0] = uSrc1.au32[2];
13963 puDst->au32[1] = uSrc2.au32[2];
13964 puDst->au32[2] = uSrc1.au32[3];
13965 puDst->au32[3] = uSrc2.au32[3];
13966}
13967
13968
13969IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13970{
13971 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13972 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13973 ASMCompilerBarrier();
13974 puDst->au32[0] = uSrc1.au32[2];
13975 puDst->au32[1] = uSrc2.au32[2];
13976 puDst->au32[2] = uSrc1.au32[3];
13977 puDst->au32[3] = uSrc2.au32[3];
13978
13979 puDst->au32[4] = uSrc1.au32[6];
13980 puDst->au32[5] = uSrc2.au32[6];
13981 puDst->au32[6] = uSrc1.au32[7];
13982 puDst->au32[7] = uSrc2.au32[7];
13983}
13984
13985
13986/*
13987 * UNPCKHPD / VUNPCKHPD
13988 */
13989#ifdef IEM_WITHOUT_ASSEMBLY
13990IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13991{
13992 RTUINT128U uSrc1 = *puDst;
13993 RTUINT128U uSrc2 = *puSrc;
13994 ASMCompilerBarrier();
13995 puDst->au64[0] = uSrc1.au64[1];
13996 puDst->au64[1] = uSrc2.au64[1];
13997}
13998
13999#endif
14000
14001IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14002{
14003 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14004 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14005 ASMCompilerBarrier();
14006 puDst->au64[0] = uSrc1.au64[1];
14007 puDst->au64[1] = uSrc2.au64[1];
14008}
14009
14010
14011IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14012{
14013 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14014 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14015 ASMCompilerBarrier();
14016 puDst->au64[0] = uSrc1.au64[1];
14017 puDst->au64[1] = uSrc2.au64[1];
14018 puDst->au64[2] = uSrc1.au64[3];
14019 puDst->au64[3] = uSrc2.au64[3];
14020}
14021
14022
14023/*
14024 * CRC32 (SSE 4.2).
14025 */
14026
14027IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
14028{
14029 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14030}
14031
14032
14033IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
14034{
14035 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14036}
14037
14038IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
14039{
14040 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14041}
14042
14043IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
14044{
14045 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14046}
14047
14048
14049/*
14050 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
14051 */
14052#ifdef IEM_WITHOUT_ASSEMBLY
14053IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
14054{
14055 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14056 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14057 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14058 fEfl |= X86_EFL_ZF;
14059 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14060 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14061 fEfl |= X86_EFL_CF;
14062 *pfEFlags = fEfl;
14063}
14064#endif
14065
14066IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
14067{
14068 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14069 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14070 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
14071 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
14072 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14073 fEfl |= X86_EFL_ZF;
14074 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14075 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
14076 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
14077 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14078 fEfl |= X86_EFL_CF;
14079 *pfEFlags = fEfl;
14080}
14081
14082
14083/*
14084 * PMOVSXBW / VPMOVSXBW
14085 */
14086IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14087{
14088 RTUINT64U uSrc1 = { uSrc };
14089 puDst->ai16[0] = uSrc1.ai8[0];
14090 puDst->ai16[1] = uSrc1.ai8[1];
14091 puDst->ai16[2] = uSrc1.ai8[2];
14092 puDst->ai16[3] = uSrc1.ai8[3];
14093 puDst->ai16[4] = uSrc1.ai8[4];
14094 puDst->ai16[5] = uSrc1.ai8[5];
14095 puDst->ai16[6] = uSrc1.ai8[6];
14096 puDst->ai16[7] = uSrc1.ai8[7];
14097}
14098
14099
14100IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14101{
14102 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14103 puDst->ai16[ 0] = uSrc1.ai8[ 0];
14104 puDst->ai16[ 1] = uSrc1.ai8[ 1];
14105 puDst->ai16[ 2] = uSrc1.ai8[ 2];
14106 puDst->ai16[ 3] = uSrc1.ai8[ 3];
14107 puDst->ai16[ 4] = uSrc1.ai8[ 4];
14108 puDst->ai16[ 5] = uSrc1.ai8[ 5];
14109 puDst->ai16[ 6] = uSrc1.ai8[ 6];
14110 puDst->ai16[ 7] = uSrc1.ai8[ 7];
14111 puDst->ai16[ 8] = uSrc1.ai8[ 8];
14112 puDst->ai16[ 9] = uSrc1.ai8[ 9];
14113 puDst->ai16[10] = uSrc1.ai8[10];
14114 puDst->ai16[11] = uSrc1.ai8[11];
14115 puDst->ai16[12] = uSrc1.ai8[12];
14116 puDst->ai16[13] = uSrc1.ai8[13];
14117 puDst->ai16[14] = uSrc1.ai8[14];
14118 puDst->ai16[15] = uSrc1.ai8[15];
14119}
14120
14121
14122/*
14123 * PMOVSXBD / VPMOVSXBD
14124 */
14125IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14126{
14127 RTUINT32U uSrc1 = { uSrc };
14128 puDst->ai32[0] = uSrc1.ai8[0];
14129 puDst->ai32[1] = uSrc1.ai8[1];
14130 puDst->ai32[2] = uSrc1.ai8[2];
14131 puDst->ai32[3] = uSrc1.ai8[3];
14132}
14133
14134
14135IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14136{
14137 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14138 puDst->ai32[0] = uSrc1.ai8[0];
14139 puDst->ai32[1] = uSrc1.ai8[1];
14140 puDst->ai32[2] = uSrc1.ai8[2];
14141 puDst->ai32[3] = uSrc1.ai8[3];
14142 puDst->ai32[4] = uSrc1.ai8[4];
14143 puDst->ai32[5] = uSrc1.ai8[5];
14144 puDst->ai32[6] = uSrc1.ai8[6];
14145 puDst->ai32[7] = uSrc1.ai8[7];
14146}
14147
14148
14149/*
14150 * PMOVSXBQ / VPMOVSXBQ
14151 */
14152IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14153{
14154 RTUINT16U uSrc1 = { uSrc };
14155 puDst->ai64[0] = uSrc1.ai8[0];
14156 puDst->ai64[1] = uSrc1.ai8[1];
14157}
14158
14159
14160IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14161{
14162 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14163 puDst->ai64[0] = uSrc1.ai8[0];
14164 puDst->ai64[1] = uSrc1.ai8[1];
14165 puDst->ai64[2] = uSrc1.ai8[2];
14166 puDst->ai64[3] = uSrc1.ai8[3];
14167}
14168
14169
14170/*
14171 * PMOVSXWD / VPMOVSXWD
14172 */
14173IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14174{
14175 RTUINT64U uSrc1 = { uSrc };
14176 puDst->ai32[0] = uSrc1.ai16[0];
14177 puDst->ai32[1] = uSrc1.ai16[1];
14178 puDst->ai32[2] = uSrc1.ai16[2];
14179 puDst->ai32[3] = uSrc1.ai16[3];
14180}
14181
14182
14183IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14184{
14185 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14186 puDst->ai32[0] = uSrc1.ai16[0];
14187 puDst->ai32[1] = uSrc1.ai16[1];
14188 puDst->ai32[2] = uSrc1.ai16[2];
14189 puDst->ai32[3] = uSrc1.ai16[3];
14190 puDst->ai32[4] = uSrc1.ai16[4];
14191 puDst->ai32[5] = uSrc1.ai16[5];
14192 puDst->ai32[6] = uSrc1.ai16[6];
14193 puDst->ai32[7] = uSrc1.ai16[7];
14194}
14195
14196
14197/*
14198 * PMOVSXWQ / VPMOVSXWQ
14199 */
14200IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14201{
14202 RTUINT32U uSrc1 = { uSrc };
14203 puDst->ai64[0] = uSrc1.ai16[0];
14204 puDst->ai64[1] = uSrc1.ai16[1];
14205}
14206
14207
14208IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14209{
14210 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14211 puDst->ai64[0] = uSrc1.ai16[0];
14212 puDst->ai64[1] = uSrc1.ai16[1];
14213 puDst->ai64[2] = uSrc1.ai16[2];
14214 puDst->ai64[3] = uSrc1.ai16[3];
14215}
14216
14217
14218/*
14219 * PMOVSXDQ / VPMOVSXDQ
14220 */
14221IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14222{
14223 RTUINT64U uSrc1 = { uSrc };
14224 puDst->ai64[0] = uSrc1.ai32[0];
14225 puDst->ai64[1] = uSrc1.ai32[1];
14226}
14227
14228
14229IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14230{
14231 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14232 puDst->ai64[0] = uSrc1.ai32[0];
14233 puDst->ai64[1] = uSrc1.ai32[1];
14234 puDst->ai64[2] = uSrc1.ai32[2];
14235 puDst->ai64[3] = uSrc1.ai32[3];
14236}
14237
14238
14239/*
14240 * PMOVZXBW / VPMOVZXBW
14241 */
14242IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14243{
14244 RTUINT64U uSrc1 = { uSrc };
14245 puDst->au16[0] = uSrc1.au8[0];
14246 puDst->au16[1] = uSrc1.au8[1];
14247 puDst->au16[2] = uSrc1.au8[2];
14248 puDst->au16[3] = uSrc1.au8[3];
14249 puDst->au16[4] = uSrc1.au8[4];
14250 puDst->au16[5] = uSrc1.au8[5];
14251 puDst->au16[6] = uSrc1.au8[6];
14252 puDst->au16[7] = uSrc1.au8[7];
14253}
14254
14255
14256IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14257{
14258 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14259 puDst->au16[ 0] = uSrc1.au8[ 0];
14260 puDst->au16[ 1] = uSrc1.au8[ 1];
14261 puDst->au16[ 2] = uSrc1.au8[ 2];
14262 puDst->au16[ 3] = uSrc1.au8[ 3];
14263 puDst->au16[ 4] = uSrc1.au8[ 4];
14264 puDst->au16[ 5] = uSrc1.au8[ 5];
14265 puDst->au16[ 6] = uSrc1.au8[ 6];
14266 puDst->au16[ 7] = uSrc1.au8[ 7];
14267 puDst->au16[ 8] = uSrc1.au8[ 8];
14268 puDst->au16[ 9] = uSrc1.au8[ 9];
14269 puDst->au16[10] = uSrc1.au8[10];
14270 puDst->au16[11] = uSrc1.au8[11];
14271 puDst->au16[12] = uSrc1.au8[12];
14272 puDst->au16[13] = uSrc1.au8[13];
14273 puDst->au16[14] = uSrc1.au8[14];
14274 puDst->au16[15] = uSrc1.au8[15];
14275}
14276
14277
14278/*
14279 * PMOVZXBD / VPMOVZXBD
14280 */
14281IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14282{
14283 RTUINT32U uSrc1 = { uSrc };
14284 puDst->au32[0] = uSrc1.au8[0];
14285 puDst->au32[1] = uSrc1.au8[1];
14286 puDst->au32[2] = uSrc1.au8[2];
14287 puDst->au32[3] = uSrc1.au8[3];
14288}
14289
14290
14291IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14292{
14293 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14294 puDst->au32[0] = uSrc1.au8[0];
14295 puDst->au32[1] = uSrc1.au8[1];
14296 puDst->au32[2] = uSrc1.au8[2];
14297 puDst->au32[3] = uSrc1.au8[3];
14298 puDst->au32[4] = uSrc1.au8[4];
14299 puDst->au32[5] = uSrc1.au8[5];
14300 puDst->au32[6] = uSrc1.au8[6];
14301 puDst->au32[7] = uSrc1.au8[7];
14302}
14303
14304
14305/*
14306 * PMOVZXBQ / VPMOVZXBQ
14307 */
14308IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14309{
14310 RTUINT16U uSrc1 = { uSrc };
14311 puDst->au64[0] = uSrc1.au8[0];
14312 puDst->au64[1] = uSrc1.au8[1];
14313}
14314
14315
14316IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14317{
14318 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14319 puDst->au64[0] = uSrc1.au8[0];
14320 puDst->au64[1] = uSrc1.au8[1];
14321 puDst->au64[2] = uSrc1.au8[2];
14322 puDst->au64[3] = uSrc1.au8[3];
14323}
14324
14325
14326/*
14327 * PMOVZXWD / VPMOVZXWD
14328 */
14329IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14330{
14331 RTUINT64U uSrc1 = { uSrc };
14332 puDst->au32[0] = uSrc1.au16[0];
14333 puDst->au32[1] = uSrc1.au16[1];
14334 puDst->au32[2] = uSrc1.au16[2];
14335 puDst->au32[3] = uSrc1.au16[3];
14336}
14337
14338
14339IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14340{
14341 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14342 puDst->au32[0] = uSrc1.au16[0];
14343 puDst->au32[1] = uSrc1.au16[1];
14344 puDst->au32[2] = uSrc1.au16[2];
14345 puDst->au32[3] = uSrc1.au16[3];
14346 puDst->au32[4] = uSrc1.au16[4];
14347 puDst->au32[5] = uSrc1.au16[5];
14348 puDst->au32[6] = uSrc1.au16[6];
14349 puDst->au32[7] = uSrc1.au16[7];
14350}
14351
14352
14353/*
14354 * PMOVZXWQ / VPMOVZXWQ
14355 */
14356IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14357{
14358 RTUINT32U uSrc1 = { uSrc };
14359 puDst->au64[0] = uSrc1.au16[0];
14360 puDst->au64[1] = uSrc1.au16[1];
14361}
14362
14363
14364IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14365{
14366 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14367 puDst->au64[0] = uSrc1.au16[0];
14368 puDst->au64[1] = uSrc1.au16[1];
14369 puDst->au64[2] = uSrc1.au16[2];
14370 puDst->au64[3] = uSrc1.au16[3];
14371}
14372
14373
14374/*
14375 * PMOVZXDQ / VPMOVZXDQ
14376 */
14377IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14378{
14379 RTUINT64U uSrc1 = { uSrc };
14380 puDst->au64[0] = uSrc1.au32[0];
14381 puDst->au64[1] = uSrc1.au32[1];
14382}
14383
14384
14385IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14386{
14387 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14388 puDst->au64[0] = uSrc1.au32[0];
14389 puDst->au64[1] = uSrc1.au32[1];
14390 puDst->au64[2] = uSrc1.au32[2];
14391 puDst->au64[3] = uSrc1.au32[3];
14392}
14393
14394
14395#ifdef IEM_WITHOUT_ASSEMBLY
14396/**
14397 * Converts from the packed IPRT 32-bit (single precision) floating point format to
14398 * the SoftFloat 32-bit floating point format (float32_t).
14399 *
14400 * This is only a structure format conversion, nothing else.
14401 */
14402DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
14403{
14404 float32_t Tmp;
14405 Tmp.v = pr32Val->u;
14406 return Tmp;
14407}
14408
14409
14410/**
14411 * Converts from SoftFloat 32-bit floating point format (float32_t)
14412 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
14413 *
14414 * This is only a structure format conversion, nothing else.
14415 */
14416DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
14417{
14418 pr32Dst->u = r32XSrc.v;
14419 return pr32Dst;
14420}
14421
14422
14423/**
14424 * Converts from the packed IPRT 64-bit (single precision) floating point format to
14425 * the SoftFloat 64-bit floating point format (float64_t).
14426 *
14427 * This is only a structure format conversion, nothing else.
14428 */
14429DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
14430{
14431 float64_t Tmp;
14432 Tmp.v = pr64Val->u;
14433 return Tmp;
14434}
14435
14436
14437/**
14438 * Converts from SoftFloat 64-bit floating point format (float64_t)
14439 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
14440 *
14441 * This is only a structure format conversion, nothing else.
14442 */
14443DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
14444{
14445 pr64Dst->u = r64XSrc.v;
14446 return pr64Dst;
14447}
14448
14449
/**
 * Positional initializer for a SoftFloat state structure, derived from MXCSR.
 *
 * The rounding mode is translated from the MXCSR.RC field and the fourth
 * member is filled with the MXCSR exception mask bits shifted down to bit 0;
 * the remaining members are constants.
 *
 * @param   a_Mxcsr     The MXCSR value to derive the state from (evaluated
 *                      more than once).
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        /* Tininess detection: */ \
        softfloat_tininess_afterRounding, \
        /* Rounding mode from MXCSR.RC; any RC value not listed selects round-toward-zero: */ \
          ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? (uint8_t)softfloat_round_min \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        :                                                           (uint8_t)softfloat_round_minMag, \
        /* Third member starts out as zero: */ \
        0, \
        /* Exception mask bits, matches X86_FSW_?E: */ \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), \
        /* Rounding precision, not relevant for SIMD: */ \
        32 \
    }
14462
14463
14464/**
14465 * Helper for transfering exception to MXCSR and setting the result value
14466 * accordingly.
14467 *
14468 * @returns Updated MXCSR.
14469 * @param pSoftState The SoftFloat state following the operation.
14470 * @param r32Result The result of the SoftFloat operation.
14471 * @param pr32Result Where to store the result for IEM.
14472 * @param fMxcsr The original MXCSR value.
14473 */
14474DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
14475 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14476{
14477 iemFpSoftF32ToIprt(pr32Result, r32Result);
14478
14479 uint8_t fXcpt = pSoftState->exceptionFlags;
14480 if ( (fMxcsr & X86_MXCSR_FZ)
14481 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
14482 {
14483 /* Underflow masked and flush to zero is set. */
14484 pr32Result->s.uFraction = 0;
14485 pr32Result->s.uExponent = 0;
14486 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14487 }
14488
14489 /* If DAZ is set \#DE is never set. */
14490 if ( fMxcsr & X86_MXCSR_DAZ
14491 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14492 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14493 fXcpt &= ~X86_MXCSR_DE;
14494
14495 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14496}
14497
14498
14499/**
14500 * Helper for transfering exception to MXCSR and setting the result value
14501 * accordingly - ignores Flush-to-Zero.
14502 *
14503 * @returns Updated MXCSR.
14504 * @param pSoftState The SoftFloat state following the operation.
14505 * @param r32Result The result of the SoftFloat operation.
14506 * @param pr32Result Where to store the result for IEM.
14507 * @param fMxcsr The original MXCSR value.
14508 */
14509DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
14510 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14511{
14512 iemFpSoftF32ToIprt(pr32Result, r32Result);
14513
14514 uint8_t fXcpt = pSoftState->exceptionFlags;
14515 /* If DAZ is set \#DE is never set. */
14516 if ( fMxcsr & X86_MXCSR_DAZ
14517 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14518 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14519 fXcpt &= ~X86_MXCSR_DE;
14520
14521 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14522}
14523
14524
14525/**
14526 * Helper for transfering exception to MXCSR and setting the result value
14527 * accordingly.
14528 *
14529 * @returns Updated MXCSR.
14530 * @param pSoftState The SoftFloat state following the operation.
14531 * @param r64Result The result of the SoftFloat operation.
14532 * @param pr64Result Where to store the result for IEM.
14533 * @param fMxcsr The original MXCSR value.
14534 */
14535DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
14536 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14537{
14538 iemFpSoftF64ToIprt(pr64Result, r64Result);
14539 uint8_t fXcpt = pSoftState->exceptionFlags;
14540 if ( (fMxcsr & X86_MXCSR_FZ)
14541 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
14542 {
14543 /* Underflow masked and flush to zero is set. */
14544 iemFpSoftF64ToIprt(pr64Result, r64Result);
14545 pr64Result->s.uFractionHigh = 0;
14546 pr64Result->s.uFractionLow = 0;
14547 pr64Result->s.uExponent = 0;
14548 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14549 }
14550
14551 /* If DAZ is set \#DE is never set. */
14552 if ( fMxcsr & X86_MXCSR_DAZ
14553 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14554 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14555 fXcpt &= ~X86_MXCSR_DE;
14556
14557 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14558}
14559
14560
14561/**
14562 * Helper for transfering exception to MXCSR and setting the result value
14563 * accordingly - ignores Flush-to-Zero.
14564 *
14565 * @returns Updated MXCSR.
14566 * @param pSoftState The SoftFloat state following the operation.
14567 * @param r64Result The result of the SoftFloat operation.
14568 * @param pr64Result Where to store the result for IEM.
14569 * @param fMxcsr The original MXCSR value.
14570 */
14571DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
14572 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14573{
14574 iemFpSoftF64ToIprt(pr64Result, r64Result);
14575
14576 uint8_t fXcpt = pSoftState->exceptionFlags;
14577 /* If DAZ is set \#DE is never set. */
14578 if ( fMxcsr & X86_MXCSR_DAZ
14579 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14580 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14581 fXcpt &= ~X86_MXCSR_DE;
14582
14583 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14584}
14585
14586
14587/**
14588 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
14589 * in MXCSR into account.
14590 *
14591 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14592 * @param pr32Val Where to store the result.
14593 * @param fMxcsr The input MXCSR value.
14594 * @param pr32Src The value to use.
14595 */
14596DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
14597{
14598 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
14599 {
14600 if (fMxcsr & X86_MXCSR_DAZ)
14601 {
14602 /* De-normals are changed to 0. */
14603 pr32Val->s.fSign = pr32Src->s.fSign;
14604 pr32Val->s.uFraction = 0;
14605 pr32Val->s.uExponent = 0;
14606 return 0;
14607 }
14608
14609 *pr32Val = *pr32Src;
14610 return X86_MXCSR_DE;
14611 }
14612
14613 *pr32Val = *pr32Src;
14614 return 0;
14615}
14616
14617
14618/**
14619 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
14620 * in MXCSR into account.
14621 *
14622 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14623 * @param pr64Val Where to store the result.
14624 * @param fMxcsr The input MXCSR value.
14625 * @param pr64Src The value to use.
14626 */
14627DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
14628{
14629 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
14630 {
14631 if (fMxcsr & X86_MXCSR_DAZ)
14632 {
14633 /* De-normals are changed to 0. */
14634 pr64Val->s64.fSign = pr64Src->s.fSign;
14635 pr64Val->s64.uFraction = 0;
14636 pr64Val->s64.uExponent = 0;
14637 return 0;
14638 }
14639
14640 *pr64Val = *pr64Src;
14641 return X86_MXCSR_DE;
14642 }
14643
14644 *pr64Val = *pr64Src;
14645 return 0;
14646}
14647
14648
14649/**
14650 * Validates the given input operands returning whether the operation can continue or whether one
14651 * of the source operands contains a NaN value, setting the output accordingly.
14652 *
14653 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14654 * @param pr32Res Where to store the result in case the operation can't continue.
14655 * @param pr32Val1 The first input operand.
14656 * @param pr32Val2 The second input operand.
14657 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14658 */
14659DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
14660{
14661 uint8_t cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
14662 uint8_t cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
14663 if (cSNan + cQNan == 2)
14664 {
14665 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14666 *pr32Res = *pr32Val1;
14667 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14668 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14669 return true;
14670 }
14671 else if (cSNan)
14672 {
14673 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14674 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14675 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14676 *pfMxcsr |= X86_MXCSR_IE;
14677 return true;
14678 }
14679 else if (cQNan)
14680 {
14681 /* The QNan operand is placed into the result. */
14682 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14683 return true;
14684 }
14685
14686 Assert(!cQNan && !cSNan);
14687 return false;
14688}
14689
14690
14691/**
14692 * Validates the given double precision input operands returning whether the operation can continue or whether one
14693 * of the source operands contains a NaN value, setting the output accordingly.
14694 *
14695 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14696 * @param pr64Res Where to store the result in case the operation can't continue.
14697 * @param pr64Val1 The first input operand.
14698 * @param pr64Val2 The second input operand.
14699 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14700 */
14701DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
14702{
14703 uint8_t cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
14704 uint8_t cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
14705 if (cSNan + cQNan == 2)
14706 {
14707 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14708 *pr64Res = *pr64Val1;
14709 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14710 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14711 return true;
14712 }
14713 else if (cSNan)
14714 {
14715 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14716 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14717 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14718 *pfMxcsr |= X86_MXCSR_IE;
14719 return true;
14720 }
14721 else if (cQNan)
14722 {
14723 /* The QNan operand is placed into the result. */
14724 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14725 return true;
14726 }
14727
14728 Assert(!cQNan && !cSNan);
14729 return false;
14730}
14731
14732
14733/**
14734 * Validates the given single input operand returning whether the operation can continue or whether
14735 * contains a NaN value, setting the output accordingly.
14736 *
14737 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14738 * @param pr32Res Where to store the result in case the operation can't continue.
14739 * @param pr32Val The input operand.
14740 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14741 */
14742DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
14743{
14744 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
14745 {
14746 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14747 *pr32Res = *pr32Val;
14748 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14749 *pfMxcsr |= X86_MXCSR_IE;
14750 return true;
14751 }
14752 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
14753 {
14754 /* The QNan operand is placed into the result. */
14755 *pr32Res = *pr32Val;
14756 return true;
14757 }
14758
14759 return false;
14760}
14761
14762
14763/**
14764 * Validates the given double input operand returning whether the operation can continue or whether
14765 * contains a NaN value, setting the output accordingly.
14766 *
14767 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14768 * @param pr64Res Where to store the result in case the operation can't continue.
14769 * @param pr64Val The input operand.
14770 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14771 */
14772DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
14773{
14774 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
14775 {
14776 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14777 *pr64Res = *pr64Val;
14778 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14779 *pfMxcsr |= X86_MXCSR_IE;
14780 return true;
14781 }
14782 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
14783 {
14784 /* The QNan operand is placed into the result. */
14785 *pr64Res = *pr64Val;
14786 return true;
14787 }
14788
14789 return false;
14790}
14791#endif
14792
14793
14794/**
14795 * ADDPS
14796 */
14797#ifdef IEM_WITHOUT_ASSEMBLY
14798static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14799{
14800 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14801 return fMxcsr;
14802
14803 RTFLOAT32U r32Src1, r32Src2;
14804 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14805 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14806 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14807 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14808 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14809}
14810
14811
14812IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14813{
14814 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14815 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14816 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14817 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14818}
14819#endif
14820
14821
14822/**
14823 * ADDSS
14824 */
14825#ifdef IEM_WITHOUT_ASSEMBLY
14826IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14827{
14828 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14829 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14830 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14831 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14832}
14833#endif
14834
14835
14836/**
14837 * ADDPD
14838 */
14839#ifdef IEM_WITHOUT_ASSEMBLY
14840static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14841{
14842 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14843 return fMxcsr;
14844
14845 RTFLOAT64U r64Src1, r64Src2;
14846 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14847 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14848 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14849 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14850 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14851}
14852
14853
14854IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14855{
14856 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14857 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14858}
14859#endif
14860
14861
14862/**
14863 * ADDSD
14864 */
14865#ifdef IEM_WITHOUT_ASSEMBLY
14866IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14867{
14868 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14869 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14870}
14871#endif
14872
14873
14874/**
14875 * MULPS
14876 */
14877#ifdef IEM_WITHOUT_ASSEMBLY
14878static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14879{
14880 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14881 return fMxcsr;
14882
14883 RTFLOAT32U r32Src1, r32Src2;
14884 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14885 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14886 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14887 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14888 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14889}
14890
14891
14892IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14893{
14894 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14895 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14896 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14897 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14898}
14899#endif
14900
14901
14902/**
14903 * MULSS
14904 */
14905#ifdef IEM_WITHOUT_ASSEMBLY
14906IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14907{
14908 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14909 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14910 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14911 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14912}
14913#endif
14914
14915
14916/**
14917 * MULPD
14918 */
14919#ifdef IEM_WITHOUT_ASSEMBLY
14920static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14921{
14922 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14923 return fMxcsr;
14924
14925 RTFLOAT64U r64Src1, r64Src2;
14926 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14927 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14928 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14929 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14930 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14931}
14932
14933
14934IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14935{
14936 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14937 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14938}
14939#endif
14940
14941
14942/**
14943 * MULSD
14944 */
14945#ifdef IEM_WITHOUT_ASSEMBLY
14946IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14947{
14948 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14949 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14950}
14951#endif
14952
14953
14954/**
14955 * SUBPS
14956 */
14957#ifdef IEM_WITHOUT_ASSEMBLY
14958static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14959{
14960 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14961 return fMxcsr;
14962
14963 RTFLOAT32U r32Src1, r32Src2;
14964 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14965 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14966 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14967 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14968 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14969}
14970
14971
14972IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14973{
14974 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14975 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14976 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14977 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14978}
14979#endif
14980
14981
14982/**
14983 * SUBSS
14984 */
14985#ifdef IEM_WITHOUT_ASSEMBLY
14986IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14987{
14988 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14989 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14990 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14991 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14992}
14993#endif
14994
14995
14996/**
14997 * SUBPD
14998 */
14999#ifdef IEM_WITHOUT_ASSEMBLY
15000static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15001{
15002 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15003 return fMxcsr;
15004
15005 RTFLOAT64U r64Src1, r64Src2;
15006 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15007 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15008 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15009 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15010 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15011}
15012
15013
15014IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15015{
15016 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15017 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15018}
15019#endif
15020
15021
15022/**
15023 * SUBSD
15024 */
15025#ifdef IEM_WITHOUT_ASSEMBLY
15026IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15027{
15028 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15029 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15030}
15031#endif
15032
15033
15034/**
15035 * MINPS
15036 */
15037#ifdef IEM_WITHOUT_ASSEMBLY
15038static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15039{
15040 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15041 {
15042 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15043 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15044 return fMxcsr | X86_MXCSR_IE;
15045 }
15046
15047 RTFLOAT32U r32Src1, r32Src2;
15048 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15049 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15050 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15051 {
15052 *pr32Res = r32Src2;
15053 return fMxcsr;
15054 }
15055
15056 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15057 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15058 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15059 fLe
15060 ? iemFpSoftF32FromIprt(&r32Src1)
15061 : iemFpSoftF32FromIprt(&r32Src2),
15062 pr32Res, fMxcsr);
15063}
15064
15065
15066IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15067{
15068 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15069 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15070 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15071 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15072}
15073#endif
15074
15075
15076/**
15077 * MINSS
15078 */
15079#ifdef IEM_WITHOUT_ASSEMBLY
15080IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15081{
15082 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15083 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15084 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15085 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15086}
15087#endif
15088
15089
15090/**
15091 * MINPD
15092 */
15093#ifdef IEM_WITHOUT_ASSEMBLY
15094static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15095{
15096 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15097 {
15098 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15099 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15100 return fMxcsr | X86_MXCSR_IE;
15101 }
15102
15103 RTFLOAT64U r64Src1, r64Src2;
15104 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15105 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15106 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15107 {
15108 *pr64Res = r64Src2;
15109 return fMxcsr;
15110 }
15111
15112 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15113 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15114 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15115 fLe
15116 ? iemFpSoftF64FromIprt(&r64Src1)
15117 : iemFpSoftF64FromIprt(&r64Src2),
15118 pr64Res, fMxcsr);
15119}
15120
15121
15122IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15123{
15124 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15125 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15126}
15127#endif
15128
15129
15130/**
15131 * MINSD
15132 */
15133#ifdef IEM_WITHOUT_ASSEMBLY
15134IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15135{
15136 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15137 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15138}
15139#endif
15140
15141
15142/**
15143 * DIVPS
15144 */
15145#ifdef IEM_WITHOUT_ASSEMBLY
15146static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15147{
15148 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15149 return fMxcsr;
15150
15151 RTFLOAT32U r32Src1, r32Src2;
15152 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15153 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15154 if (RTFLOAT32U_IS_ZERO(&r32Src2))
15155 {
15156 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
15157 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
15158 {
15159 *pr32Res = g_ar32QNaN[1];
15160 return fMxcsr | X86_MXCSR_IE;
15161 }
15162 else if (RTFLOAT32U_IS_INF(&r32Src1))
15163 {
15164 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15165 return fMxcsr;
15166 }
15167 else
15168 {
15169 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15170 return fMxcsr | X86_MXCSR_ZE;
15171 }
15172 }
15173
15174 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15175 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15176 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15177}
15178
15179
15180IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15181{
15182 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15183 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15184 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15185 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15186}
15187#endif
15188
15189
15190/**
15191 * DIVSS
15192 */
15193#ifdef IEM_WITHOUT_ASSEMBLY
15194IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15195{
15196 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15197 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15198 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15199 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15200}
15201#endif
15202
15203
15204/**
15205 * DIVPD
15206 */
15207#ifdef IEM_WITHOUT_ASSEMBLY
15208static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15209{
15210 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15211 return fMxcsr;
15212
15213 RTFLOAT64U r64Src1, r64Src2;
15214 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15215 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15216 if (RTFLOAT64U_IS_ZERO(&r64Src2))
15217 {
15218 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
15219 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
15220 {
15221 *pr64Res = g_ar64QNaN[1];
15222 return fMxcsr | X86_MXCSR_IE;
15223 }
15224 else if (RTFLOAT64U_IS_INF(&r64Src1))
15225 {
15226 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15227 return fMxcsr;
15228 }
15229 else
15230 {
15231 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15232 return fMxcsr | X86_MXCSR_ZE;
15233 }
15234 }
15235
15236 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15237 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15238 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15239}
15240
15241
15242IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15243{
15244 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15245 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15246}
15247#endif
15248
15249
15250/**
15251 * DIVSD
15252 */
15253#ifdef IEM_WITHOUT_ASSEMBLY
15254IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15255{
15256 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15257 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15258}
15259#endif
15260
15261
15262/**
15263 * MAXPS
15264 */
15265#ifdef IEM_WITHOUT_ASSEMBLY
15266static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15267{
15268 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15269 {
15270 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15271 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15272 return fMxcsr | X86_MXCSR_IE;
15273 }
15274
15275 RTFLOAT32U r32Src1, r32Src2;
15276 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15277 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15278 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15279 {
15280 *pr32Res = r32Src2;
15281 return fMxcsr;
15282 }
15283
15284 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15285 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15286 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15287 fLe
15288 ? iemFpSoftF32FromIprt(&r32Src2)
15289 : iemFpSoftF32FromIprt(&r32Src1),
15290 pr32Res, fMxcsr);
15291}
15292
15293
15294IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15295{
15296 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15297 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15298 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15299 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15300}
15301#endif
15302
15303
15304/**
15305 * MAXSS
15306 */
15307#ifdef IEM_WITHOUT_ASSEMBLY
15308IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15309{
15310 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15311 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15312 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15313 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15314}
15315#endif
15316
15317
15318/**
15319 * MAXPD
15320 */
15321#ifdef IEM_WITHOUT_ASSEMBLY
15322static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15323{
15324 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15325 {
15326 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15327 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15328 return fMxcsr | X86_MXCSR_IE;
15329 }
15330
15331 RTFLOAT64U r64Src1, r64Src2;
15332 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15333 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15334 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15335 {
15336 *pr64Res = r64Src2;
15337 return fMxcsr;
15338 }
15339
15340 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15341 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15342 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15343 fLe
15344 ? iemFpSoftF64FromIprt(&r64Src2)
15345 : iemFpSoftF64FromIprt(&r64Src1),
15346 pr64Res, fMxcsr);
15347}
15348
15349
15350IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15351{
15352 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15353 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15354}
15355#endif
15356
15357
15358/**
15359 * MAXSD
15360 */
15361#ifdef IEM_WITHOUT_ASSEMBLY
15362IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15363{
15364 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15365 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15366}
15367#endif
15368
15369
15370/**
15371 * CVTSS2SD
15372 */
15373#ifdef IEM_WITHOUT_ASSEMBLY
15374static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15375{
15376 RTFLOAT32U r32Src1;
15377 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15378
15379 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15380 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15381 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15382}
15383
15384
15385IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15386{
15387 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
15388 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15389}
15390#endif
15391
15392
15393/**
15394 * CVTSD2SS
15395 */
15396#ifdef IEM_WITHOUT_ASSEMBLY
15397static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15398{
15399 RTFLOAT64U r64Src1;
15400 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15401
15402 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15403 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15404 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15405}
15406
15407
15408IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15409{
15410 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
15411 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15412 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15413 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15414}
15415#endif
15416
15417
15418/**
15419 * HADDPS
15420 */
15421#ifdef IEM_WITHOUT_ASSEMBLY
15422IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15423{
15424 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15425 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15426 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15427 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15428}
15429#endif
15430
15431
15432/**
15433 * HADDPD
15434 */
15435#ifdef IEM_WITHOUT_ASSEMBLY
15436IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15437{
15438 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15439 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15440}
15441#endif
15442
15443
15444/**
15445 * HSUBPS
15446 */
15447#ifdef IEM_WITHOUT_ASSEMBLY
15448IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15449{
15450 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15451 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15452 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15453 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15454}
15455#endif
15456
15457
15458/**
15459 * HSUBPD
15460 */
15461#ifdef IEM_WITHOUT_ASSEMBLY
15462IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15463{
15464 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15465 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15466}
15467#endif
15468
15469
15470/**
15471 * SQRTPS
15472 */
15473#ifdef IEM_WITHOUT_ASSEMBLY
15474static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15475{
15476 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15477 return fMxcsr;
15478
15479 RTFLOAT32U r32Src;
15480 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
15481 if (RTFLOAT32U_IS_ZERO(&r32Src))
15482 {
15483 *pr32Res = r32Src;
15484 return fMxcsr;
15485 }
15486 else if (r32Src.s.fSign)
15487 {
15488 *pr32Res = g_ar32QNaN[1];
15489 return fMxcsr | X86_MXCSR_IE;
15490 }
15491
15492 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15493 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15494 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15495}
15496
15497
15498IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15499{
15500 RT_NOREF(puSrc1);
15501
15502 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15503 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15504 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15505 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15506}
15507#endif
15508
15509
15510/**
15511 * SQRTSS
15512 */
15513#ifdef IEM_WITHOUT_ASSEMBLY
15514IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15515{
15516 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15517 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15518 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15519 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15520}
15521#endif
15522
15523
15524/**
15525 * SQRTPD
15526 */
15527#ifdef IEM_WITHOUT_ASSEMBLY
15528static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
15529{
15530 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
15531 return fMxcsr;
15532
15533 RTFLOAT64U r64Src;
15534 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
15535 if (RTFLOAT64U_IS_ZERO(&r64Src))
15536 {
15537 *pr64Res = r64Src;
15538 return fMxcsr;
15539 }
15540 else if (r64Src.s.fSign)
15541 {
15542 *pr64Res = g_ar64QNaN[1];
15543 return fMxcsr | X86_MXCSR_IE;
15544 }
15545
15546 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15547 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
15548 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15549}
15550
15551
15552IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15553{
15554 RT_NOREF(puSrc1);
15555
15556 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15557 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15558}
15559#endif
15560
15561
15562/**
15563 * SQRTSD
15564 */
15565#ifdef IEM_WITHOUT_ASSEMBLY
15566IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15567{
15568 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
15569 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15570}
15571#endif
15572
15573
15574/**
15575 * ADDSUBPS
15576 */
15577#ifdef IEM_WITHOUT_ASSEMBLY
15578IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15579{
15580 RT_NOREF(puSrc1);
15581
15582 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15583 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15584 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15585 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15586}
15587#endif
15588
15589
15590/**
15591 * ADDSUBPD
15592 */
15593#ifdef IEM_WITHOUT_ASSEMBLY
15594IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15595{
15596 RT_NOREF(puSrc1);
15597
15598 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15599 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15600}
15601#endif
15602
15603
15604/**
15605 * CVTPD2PS
15606 */
15607#ifdef IEM_WITHOUT_ASSEMBLY
15608static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15609{
15610 RTFLOAT64U r64Src1;
15611 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15612
15613 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15614 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15615 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15616}
15617
15618
15619IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15620{
15621 RT_NOREF(puSrc1);
15622
15623 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15624 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15625 pResult->uResult.au32[2] = 0;
15626 pResult->uResult.au32[3] = 0;
15627}
15628#endif
15629
15630
15631/**
15632 * CVTPS2PD
15633 */
15634#ifdef IEM_WITHOUT_ASSEMBLY
15635static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15636{
15637 RTFLOAT32U r32Src1;
15638 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15639
15640 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15641 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15642 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15643}
15644
15645
15646IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15647{
15648 RT_NOREF(puSrc1);
15649
15650 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15651 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15652}
15653#endif
15654
15655
15656/**
15657 * CVTDQ2PS
15658 */
15659#ifdef IEM_WITHOUT_ASSEMBLY
15660static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
15661{
15662 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15663 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
15664 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15665}
15666
15667
15668IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15669{
15670 RT_NOREF(puSrc1);
15671
15672 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
15673 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
15674 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
15675 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
15676}
15677#endif
15678
15679
15680/**
15681 * CVTPS2DQ
15682 */
15683#ifdef IEM_WITHOUT_ASSEMBLY
15684static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15685{
15686 RTFLOAT32U r32Src;
15687 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
15688
15689 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15690 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15691 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15692}
15693
15694
15695IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15696{
15697 RT_NOREF(puSrc1);
15698
15699 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15700 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15701 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15702 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15703}
15704#endif
15705
15706
15707/**
15708 * CVTTPS2DQ
15709 */
15710#ifdef IEM_WITHOUT_ASSEMBLY
15711static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15712{
15713 RTFLOAT32U r32Src;
15714 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
15715
15716 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15717 SoftState.roundingMode = softfloat_round_minMag;
15718 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
15719 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15720}
15721
15722
15723IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15724{
15725 RT_NOREF(puSrc1);
15726
15727 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15728 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15729 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15730 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15731}
15732#endif
15733
15734
15735/**
15736 * CVTTPD2DQ
15737 */
15738#ifdef IEM_WITHOUT_ASSEMBLY
15739static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15740{
15741 RTFLOAT64U r64Src;
15742 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
15743
15744 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15745 SoftState.roundingMode = softfloat_round_minMag;
15746 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15747 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15748}
15749
15750
15751IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15752{
15753 RT_NOREF(puSrc1);
15754
15755 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15756 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15757 pResult->uResult.au64[1] = 0;
15758}
15759#endif
15760
15761
15762/**
15763 * CVTDQ2PD
15764 */
15765#ifdef IEM_WITHOUT_ASSEMBLY
15766static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
15767{
15768 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15769 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
15770 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15771}
15772
15773
15774IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15775{
15776 RT_NOREF(puSrc1);
15777
15778 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
15779 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
15780}
15781#endif
15782
15783
15784/**
15785 * CVTPD2DQ
15786 */
15787#ifdef IEM_WITHOUT_ASSEMBLY
15788static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15789{
15790 RTFLOAT64U r64Src;
15791 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
15792
15793 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15794 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15795 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15796}
15797
15798
15799IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15800{
15801 RT_NOREF(puSrc1);
15802
15803 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15804 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15805 pResult->uResult.au64[1] = 0;
15806}
15807#endif
15808
15809
15810/**
15811 * [V]SHUFPS
15812 */
15813#ifdef IEM_WITHOUT_ASSEMBLY
15814IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15815{
15816 RTUINT128U const uSrc1 = *puDst;
15817 RTUINT128U const uSrc2 = *puSrc;
15818 ASMCompilerBarrier();
15819 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15820 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15821 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15822 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15823}
15824#endif
15825
15826
15827IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15828{
15829 RTUINT128U const uSrc1 = *puSrc1;
15830 RTUINT128U const uSrc2 = *puSrc2;
15831 ASMCompilerBarrier();
15832 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15833 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15834 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15835 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15836}
15837
15838
15839IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15840{
15841 RTUINT256U const uSrc1 = *puSrc1;
15842 RTUINT256U const uSrc2 = *puSrc2;
15843 ASMCompilerBarrier();
15844 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15845 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15846 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15847 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15848
15849 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
15850 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
15851 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
15852 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
15853}
15854
15855
15856/**
15857 * [V]SHUFPD
15858 */
15859#ifdef IEM_WITHOUT_ASSEMBLY
15860IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15861{
15862 RTUINT128U const uSrc1 = *puDst;
15863 RTUINT128U const uSrc2 = *puSrc;
15864 ASMCompilerBarrier();
15865 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15866 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15867}
15868#endif
15869
15870
15871IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15872{
15873 RTUINT128U const uSrc1 = *puSrc1;
15874 RTUINT128U const uSrc2 = *puSrc2;
15875 ASMCompilerBarrier();
15876 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15877 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15878}
15879
15880
15881IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15882{
15883 RTUINT256U const uSrc1 = *puSrc1;
15884 RTUINT256U const uSrc2 = *puSrc2;
15885 ASMCompilerBarrier();
15886 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15887 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15888 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
15889 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
15890}
15891
15892
15893/*
15894 * PHMINPOSUW / VPHMINPOSUW
15895 */
15896IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15897{
15898 uint16_t u16Min = puSrc->au16[0];
15899 uint8_t idxMin = 0;
15900
15901 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
15902 if (puSrc->au16[i] < u16Min)
15903 {
15904 u16Min = puSrc->au16[i];
15905 idxMin = i;
15906 }
15907
15908 puDst->au64[0] = 0;
15909 puDst->au64[1] = 0;
15910 puDst->au16[0] = u16Min;
15911 puDst->au16[1] = idxMin;
15912}
15913
15914
15915IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15916{
15917 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
15918}
15919
15920
15921/*
15922 * [V]PBLENDVB
15923 */
15924IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15925{
15926 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15927 if (puMask->au8[i] & RT_BIT(7))
15928 puDst->au8[i] = puSrc->au8[i];
15929}
15930
15931
15932IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15933{
15934 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15935 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
15936}
15937
15938
15939IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15940{
15941 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15942 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
15943}
15944
15945
15946/*
15947 * [V]BLENDVPS
15948 */
15949IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15950{
15951 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15952 if (puMask->au32[i] & RT_BIT_32(31))
15953 puDst->au32[i] = puSrc->au32[i];
15954}
15955
15956
15957IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15958{
15959 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15960 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
15961}
15962
15963
15964IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15965{
15966 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15967 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
15968}
15969
15970
15971/*
15972 * [V]BLENDVPD
15973 */
15974IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15975{
15976 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
15977 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
15978}
15979
15980
15981IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15982{
15983 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15984 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
15985}
15986
15987
15988IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15989{
15990 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15991 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
15992}
15993
15994
15995/**
15996 * [V]PALIGNR
15997 */
15998IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
15999{
16000 uint64_t const u64Src1 = *pu64Dst;
16001 ASMCompilerBarrier();
16002
16003 if (bEvil >= 16)
16004 *pu64Dst = 0;
16005 else if (bEvil >= 8)
16006 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
16007 else
16008 {
16009 uint8_t cShift = bEvil * 8;
16010 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
16011 | (u64Src2 >> cShift);
16012 }
16013}
16014
16015
16016IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16017{
16018 RTUINT128U const uSrc1 = *puDst;
16019 RTUINT128U const uSrc2 = *puSrc;
16020 ASMCompilerBarrier();
16021
16022 puDst->au64[0] = 0;
16023 puDst->au64[1] = 0;
16024 if (bEvil >= 32)
16025 { /* Everything stays 0. */ }
16026 else if (bEvil >= 16)
16027 {
16028 bEvil -= 16;
16029 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16030 puDst->au8[i - bEvil] = uSrc1.au8[i];
16031 }
16032 else
16033 {
16034 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16035 puDst->au8[i] = uSrc2.au8[i + bEvil];
16036 for (uint8_t i = 0; i < bEvil; i++)
16037 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16038 }
16039}
16040
16041
16042IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16043{
16044 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16045 RTUINT128U const uSrc2 = *puSrc2;
16046 ASMCompilerBarrier();
16047
16048 puDst->au64[0] = 0;
16049 puDst->au64[1] = 0;
16050 if (bEvil >= 32)
16051 { /* Everything stays 0. */ }
16052 else if (bEvil >= 16)
16053 {
16054 bEvil -= 16;
16055 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16056 puDst->au8[i - bEvil] = uSrc1.au8[i];
16057 }
16058 else
16059 {
16060 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16061 puDst->au8[i] = uSrc2.au8[i + bEvil];
16062 for (uint8_t i = 0; i < bEvil; i++)
16063 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16064 }
16065}
16066
16067
16068IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16069{
16070 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16071 RTUINT256U const uSrc2 = *puSrc2;
16072 ASMCompilerBarrier();
16073
16074 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
16075 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
16076}
16077
16078
16079/**
16080 * [V]PBLENDW
16081 */
16082IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16083{
16084 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16085 if (bEvil & RT_BIT(i))
16086 puDst->au16[i] = puSrc->au16[i];
16087}
16088
16089
16090IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16091{
16092 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16093 if (bEvil & RT_BIT(i))
16094 puDst->au16[i] = puSrc2->au16[i];
16095 else
16096 puDst->au16[i] = puSrc1->au16[i];
16097}
16098
16099
16100IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16101{
16102 for (uint8_t i = 0; i < 8; i++)
16103 if (bEvil & RT_BIT(i))
16104 {
16105 puDst->au16[ i] = puSrc2->au16[ i];
16106 puDst->au16[8 + i] = puSrc2->au16[8 + i];
16107 }
16108 else
16109 {
16110 puDst->au16[ i] = puSrc1->au16[ i];
16111 puDst->au16[8 + i] = puSrc1->au16[8 + i];
16112 }
16113}
16114
16115
16116/**
16117 * [V]BLENDPS
16118 */
16119IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16120{
16121 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16122 if (bEvil & RT_BIT(i))
16123 puDst->au32[i] = puSrc->au32[i];
16124}
16125
16126
16127IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16128{
16129 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16130 if (bEvil & RT_BIT(i))
16131 puDst->au32[i] = puSrc2->au32[i];
16132 else
16133 puDst->au32[i] = puSrc1->au32[i];
16134}
16135
16136
16137IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16138{
16139 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16140 if (bEvil & RT_BIT(i))
16141 puDst->au32[i] = puSrc2->au32[i];
16142 else
16143 puDst->au32[i] = puSrc1->au32[i];
16144}
16145
16146
16147/**
16148 * [V]BLENDPD
16149 */
16150IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16151{
16152 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16153 if (bEvil & RT_BIT(i))
16154 puDst->au64[i] = puSrc->au64[i];
16155}
16156
16157
16158IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16159{
16160 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16161 if (bEvil & RT_BIT(i))
16162 puDst->au64[i] = puSrc2->au64[i];
16163 else
16164 puDst->au64[i] = puSrc1->au64[i];
16165}
16166
16167
16168IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16169{
16170 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16171 if (bEvil & RT_BIT(i))
16172 puDst->au64[i] = puSrc2->au64[i];
16173 else
16174 puDst->au64[i] = puSrc1->au64[i];
16175}
16176
16177
16178/**
16179 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
16180 */
16181
/* The S-Box lookup table, indexed by the input byte (one row per high nibble). */
static uint8_t iemAImpl_aes_sbox[256] =
{
    /* 0x00 */ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    /* 0x10 */ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    /* 0x20 */ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    /* 0x30 */ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    /* 0x40 */ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    /* 0x50 */ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    /* 0x60 */ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    /* 0x70 */ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    /* 0x80 */ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    /* 0x90 */ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    /* 0xa0 */ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    /* 0xb0 */ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    /* 0xc0 */ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    /* 0xd0 */ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    /* 0xe0 */ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    /* 0xf0 */ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
16200
/* The InvS-Box lookup table, indexed by the input byte (one row per high nibble). */
static uint8_t iemAImpl_aes_inv_sbox[256] =
{
    /* 0x00 */ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    /* 0x10 */ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    /* 0x20 */ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    /* 0x30 */ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    /* 0x40 */ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    /* 0x50 */ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    /* 0x60 */ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    /* 0x70 */ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    /* 0x80 */ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    /* 0x90 */ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    /* 0xa0 */ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    /* 0xb0 */ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    /* 0xc0 */ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    /* 0xd0 */ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    /* 0xe0 */ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    /* 0xf0 */ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
16220
/* The ShiftRows lookup table: entry i is the source byte index for result byte i. */
static uint8_t iemAImpl_aes_shift_rows_tbl[16] =
{
     0,  5, 10, 15,
     4,  9, 14,  3,
     8, 13,  2,  7,
    12,  1,  6, 11
};
16225
/* The InvShiftRows lookup table: entry i is the source byte index for result byte i. */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[16] =
{
     0, 13, 10,  7,
     4,  1, 14, 11,
     8,  5,  2, 15,
    12,  9,  6,  3
};
16230
16231static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
16232{
16233 RTUINT128U uVal;
16234 int i;
16235
16236 for (i = 0; i < 16; ++i)
16237 uVal.au8[i] = abSubst[puSrc->au8[i]];
16238
16239 return uVal;
16240}
16241
/**
 * Doubles a byte: shifts it left by one and, if bit 7 of the input was set,
 * XORs the result with 0x1b (decimal 27).
 *
 * @returns The low eight bits of the result.
 * @param   u   The byte to double.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    uint8_t const bShifted = (uint8_t)(u << 1);
    if (u & 0x80)
        return (uint8_t)(bShifted ^ 0x1b);
    return bShifted;
}
16246
16247static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
16248{
16249 RTUINT128U uVal;
16250 int i;
16251 uint8_t tmp;
16252
16253 for (i = 0; i < 16; i += 4) {
16254 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
16255 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
16256 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
16257 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
16258 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
16259 }
16260
16261 return uVal;
16262}
16263
16264static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
16265{
16266 RTUINT128U uVal;
16267 int i;
16268
16269 for (i = 0; i < 16; ++i)
16270 uVal.au8[i] = puSrc->au8[abShift[i]];
16271
16272 return uVal;
16273}
16274
16275static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
16276{
16277 uint8_t val;
16278
16279 val = ((b >> 0) & 1) * a;
16280 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
16281 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
16282 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
16283 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
16284
16285 return val;
16286}
16287
16288static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
16289{
16290 RTUINT128U uVal;
16291 int i;
16292
16293 for (i = 0; i < 16; i += 4) {
16294 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
16295 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
16296 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
16297 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
16298 }
16299
16300 return uVal;
16301}
16302
16303static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
16304{
16305 RTUINT32U uTmp;
16306
16307 uTmp.au32[0] = w;
16308 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
16309 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
16310 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
16311 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
16312
16313 return uTmp.au32[0];
16314}
16315
/**
 * Rotates a 32-bit word right by eight bits.
 *
 * @returns The rotated word.
 * @param   w   The word to rotate.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uWrapped = w << 24;  /* The low byte moves to the top. */
    uint32_t const uShifted = w >> 8;
    return uWrapped | uShifted;
}
16320
16321/**
16322 * [V]AESKEYGENASSIST
16323 */
16324IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
16325{
16326 RTUINT128U uTmp;
16327 uint32_t uRCon = bImm; /* Round constant. */
16328
16329 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
16330 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
16331 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
16332 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
16333
16334 *puDst = uTmp;
16335}
16336
16337
16338/**
16339 * [V]AESIMC
16340 */
16341IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16342{
16343 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
16344}
16345
16346
16347/**
16348 * [V]AESENC
16349 */
16350IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16351{
16352 RTUINT128U uTmp;
16353
16354 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16355 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16356 uTmp = iemAImpl_aes_mix_col(&uTmp);
16357 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16358 uTmp.au64[1] ^= puSrc->au64[1];
16359
16360 *puDst = uTmp;
16361}
16362
16363
16364/**
16365 * [V]AESENCLAST
16366 */
16367IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16368{
16369 RTUINT128U uTmp;
16370
16371 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16372 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16373 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16374 uTmp.au64[1] ^= puSrc->au64[1];
16375
16376 *puDst = uTmp;
16377}
16378
16379
16380/**
16381 * [V]AESDEC
16382 */
16383IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16384{
16385 RTUINT128U uTmp;
16386
16387 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16388 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16389 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
16390 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16391 uTmp.au64[1] ^= puSrc->au64[1];
16392
16393 *puDst = uTmp;
16394}
16395
16396
16397/**
16398 * [V]AESDECLAST
16399 */
16400IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16401{
16402 RTUINT128U uTmp;
16403
16404 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16405 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16406 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16407 uTmp.au64[1] ^= puSrc->au64[1];
16408
16409 *puDst = uTmp;
16410}
16411
16412
16413/**
16414 * [V]PCMPISTRI
16415 */
16416IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRISRC pSrc, uint8_t bEvil))
16417{
16418 RT_NOREF(pu32Ecx, pEFlags, pSrc, bEvil);
16419 AssertReleaseFailed();
16420}
16421
16422
16423/*
16424 * [V]PCLMULQDQ
16425 */
16426IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16427{
16428 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
16429}
16430
16431
16432IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16433{
16434 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
16435 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
16436
16437 puDst->au64[0] = 0;
16438 puDst->au64[1] = 0;
16439
16440 /*
16441 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
16442 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
16443 * and squeeze out some optimizations.
16444 */
16445 if (uSrc1 & 0x1)
16446 puDst->au64[0] = uSrc2;
16447
16448 uSrc1 >>= 1;
16449
16450 uint8_t iDigit = 1;
16451 while (uSrc1)
16452 {
16453 if (uSrc1 & 0x1)
16454 {
16455 puDst->au64[0] ^= (uSrc2 << iDigit);
16456 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
16457 }
16458
16459 uSrc1 >>= 1;
16460 iDigit++;
16461 }
16462}
16463
16464
16465/**
16466 * [V]PINSRW
16467 */
16468#ifdef IEM_WITHOUT_ASSEMBLY
16469IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u64,(uint64_t *pu64Dst, uint16_t u16Src, uint8_t bEvil))
16470{
16471 uint8_t cShift = (bEvil & 0x3) * 16;
16472 *pu64Dst = (*pu64Dst & ~(UINT64_C(0xffff) << cShift)) | ((uint64_t)u16Src << cShift);
16473}
16474
16475
16476IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u128,(PRTUINT128U puDst, uint16_t u16Src, uint8_t bEvil))
16477{
16478 puDst->au16[bEvil & 0x7] = u16Src;
16479}
16480#endif
16481
16482
16483IEM_DECL_IMPL_DEF(void, iemAImpl_vpinsrw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint16_t u16Src, uint8_t bEvil))
16484{
16485 *puDst = *puSrc;
16486 puDst->au16[bEvil & 0x7] = u16Src;
16487}
16488
16489
16490/**
16491 * [V]PEXTRW
16492 */
16493#ifdef IEM_WITHOUT_ASSEMBLY
16494IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u64,(uint16_t *pu16Dst, uint64_t u64Src, uint8_t bEvil))
16495{
16496 *pu16Dst = (uint16_t)(u64Src >> ((bEvil & 0x3) * 16));
16497}
16498
16499
16500IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u128,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
16501{
16502 *pu16Dst = puSrc->au16[bEvil & 0x7];
16503}
16504
16505#endif
16506
16507IEM_DECL_IMPL_DEF(void, iemAImpl_vpextrw_u128_fallback,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
16508{
16509 *pu16Dst = puSrc->au16[bEvil & 0x7];
16510}
16511
16512
16513/**
16514 * [V]MOVMSKPS
16515 */
16516#ifdef IEM_WITHOUT_ASSEMBLY
16517IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16518{
16519 *pu8Dst = puSrc->au32[0] >> 31;
16520 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16521 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16522 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16523}
16524
16525#endif
16526
16527IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16528{
16529 *pu8Dst = puSrc->au32[0] >> 31;
16530 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16531 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16532 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16533}
16534
16535
16536IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
16537{
16538 *pu8Dst = puSrc->au32[0] >> 31;
16539 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16540 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16541 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16542 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
16543 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
16544 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
16545 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
16546}
16547
16548
16549/**
16550 * [V]MOVMSKPD
16551 */
16552#ifdef IEM_WITHOUT_ASSEMBLY
16553IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16554{
16555 *pu8Dst = puSrc->au64[0] >> 63;
16556 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16557}
16558
16559#endif
16560
16561IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16562{
16563 *pu8Dst = puSrc->au64[0] >> 63;
16564 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16565}
16566
16567
16568IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
16569{
16570 *pu8Dst = puSrc->au64[0] >> 63;
16571 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16572 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
16573 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
16574}
16575
16576
16577/**
16578 * CVTTSD2SI
16579 */
16580#ifdef IEM_WITHOUT_ASSEMBLY
16581IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
16582{
16583 RTFLOAT64U r64Src;
16584
16585 r64Src.u = *pu64Src;
16586 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16587
16588 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16589 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
16590 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16591}
16592
16593
16594IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
16595{
16596 RTFLOAT64U r64Src;
16597
16598 r64Src.u = *pu64Src;
16599 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16600
16601 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16602 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
16603 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16604}
16605#endif
16606
16607
16608/**
16609 * CVTSD2SI
16610 */
16611#ifdef IEM_WITHOUT_ASSEMBLY
16612IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
16613{
16614 RTFLOAT64U r64Src;
16615
16616 r64Src.u = *pu64Src;
16617 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16618
16619 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16620 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16621 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16622}
16623
16624
16625IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
16626{
16627 RTFLOAT64U r64Src;
16628
16629 r64Src.u = *pu64Src;
16630 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16631
16632 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16633 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16634 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16635}
16636#endif
16637
16638
16639/**
16640 * CVTTSS2SI
16641 */
16642#ifdef IEM_WITHOUT_ASSEMBLY
16643IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
16644{
16645 RTFLOAT32U r32Src;
16646
16647 r32Src.u = *pu32Src;
16648 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16649
16650 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16651 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16652 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16653}
16654
16655
16656IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
16657{
16658 RTFLOAT32U r32Src;
16659
16660 r32Src.u = *pu32Src;
16661 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16662
16663 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16664 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16665 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16666}
16667#endif
16668
16669
16670/**
16671 * CVTSS2SI
16672 */
16673#ifdef IEM_WITHOUT_ASSEMBLY
16674IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
16675{
16676 RTFLOAT32U r32Src;
16677
16678 r32Src.u = *pu32Src;
16679 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16680
16681 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16682 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16683 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16684}
16685
16686
16687IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
16688{
16689 RTFLOAT32U r32Src;
16690
16691 r32Src.u = *pu32Src;
16692 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16693
16694 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16695 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16696 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16697}
16698#endif
16699
16700
16701/**
16702 * CVTSI2SD
16703 */
16704#ifdef IEM_WITHOUT_ASSEMBLY
16705IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
16706{
16707 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16708 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
16709 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
16710}
16711
16712
16713IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
16714{
16715 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16716 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
16717 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
16718}
16719#endif
16720
16721
16722/**
16723 * CVTSI2SS
16724 */
16725#ifdef IEM_WITHOUT_ASSEMBLY
16726IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
16727{
16728 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16729 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
16730 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
16731}
16732
16733
16734IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
16735{
16736 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16737 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
16738 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
16739}
16740#endif
16741
16742
16743/**
16744 * [V]UCOMISS
16745 */
16746#ifdef IEM_WITHOUT_ASSEMBLY
16747IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16748{
16749 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16750
16751 if (RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]))
16752 {
16753 *pfMxcsr |= X86_MXCSR_IE;
16754 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16755 }
16756 else if (RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
16757 {
16758 /* ucomiss doesn't raise \#IE for quiet NaNs. */
16759 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16760 }
16761 else
16762 {
16763 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16764
16765 RTFLOAT32U r32Src1, r32Src2;
16766 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
16767 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
16768
16769 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
16770 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
16771 if (f32_eq(f32Src1, f32Src2, &SoftState))
16772 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16773 else if (f32_lt(f32Src1, f32Src2, &SoftState))
16774 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16775 /* else: GREATER_THAN 000 */
16776
16777 *pfMxcsr |= fDe;
16778 }
16779
16780 *pfEFlags = fEFlagsNew;
16781}
16782#endif
16783
16784IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16785{
16786 iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
16787}
16788
16789
16790/**
16791 * [V]UCOMISD
16792 */
16793#ifdef IEM_WITHOUT_ASSEMBLY
16794IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16795{
16796 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16797
16798 if (RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]))
16799 {
16800 *pfMxcsr |= X86_MXCSR_IE;
16801 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16802 }
16803 else if (RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
16804 {
16805 /* ucomiss doesn't raise \#IE for quiet NaNs. */
16806 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16807 }
16808 else
16809 {
16810 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16811
16812 RTFLOAT64U r64Src1, r64Src2;
16813 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
16814 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
16815
16816 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
16817 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
16818 if (f64_eq(f64Src1, f64Src2, &SoftState))
16819 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16820 else if (f64_lt(f64Src1, f64Src2, &SoftState))
16821 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16822 /* else: GREATER_THAN 000 */
16823
16824 *pfMxcsr |= fDe;
16825 }
16826
16827 *pfEFlags = fEFlagsNew;
16828}
16829#endif
16830
16831IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16832{
16833 iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
16834}
16835
16836
16837/**
16838 * [V]COMISS
16839 */
16840#ifdef IEM_WITHOUT_ASSEMBLY
16841IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16842{
16843 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16844
16845 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
16846 || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
16847 {
16848 *pfMxcsr |= X86_MXCSR_IE;
16849 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16850 }
16851 else
16852 {
16853 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16854
16855 RTFLOAT32U r32Src1, r32Src2;
16856 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
16857 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
16858
16859 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
16860 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
16861 if (f32_eq(f32Src1, f32Src2, &SoftState))
16862 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16863 else if (f32_lt(f32Src1, f32Src2, &SoftState))
16864 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16865 /* else: GREATER_THAN 000 */
16866
16867 *pfMxcsr |= fDe;
16868 }
16869
16870 *pfEFlags = fEFlagsNew;
16871}
16872#endif
16873
16874
16875IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16876{
16877 iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
16878}
16879
16880
16881/**
16882 * [V]COMISD
16883 */
16884#ifdef IEM_WITHOUT_ASSEMBLY
16885IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16886{
16887 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16888
16889 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
16890 || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
16891 {
16892 *pfMxcsr |= X86_MXCSR_IE;
16893 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16894 }
16895 else
16896 {
16897 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16898
16899 RTFLOAT64U r64Src1, r64Src2;
16900 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
16901 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
16902
16903 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
16904 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
16905 if (f64_eq(f64Src1, f64Src2, &SoftState))
16906 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16907 else if (f64_lt(f64Src1, f64Src2, &SoftState))
16908 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16909 /* else: GREATER_THAN 000 */
16910
16911 *pfMxcsr |= fDe;
16912 }
16913
16914 *pfEFlags = fEFlagsNew;
16915}
16916#endif
16917
16918IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16919{
16920 iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
16921}
16922
16923
16924/**
16925 * CMPPS / CMPPD / CMPSS / CMPSD
16926 */
16927#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * One row of the compare truth table: the result of a compare predicate for
 * each possible relation between the operands A and B, plus its QNaN
 * signalling behaviour.
 */
typedef struct CMPTRUTHTBLENTRY
{
    /** Whether \#IA is signalled when one of the source operands is a QNaN. */
    bool                fSignalsOnQNan;
    /** Result when the input operands are unordered. */
    bool                fUnordered;
    /** Result when A = B. */
    bool                fEqual;
    /** Result when A < B. */
    bool                fLowerThan;
    /** Result when A > B. */
    bool                fGreaterThan;
} CMPTRUTHTBLENTRY;
/** Pointer to a const compare truth table entry. */
typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
16946
16947
16948/** The compare truth table (indexed by immediate). */
16949static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
16950{
16951 /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
16952 /* 00H (EQ_OQ) */ { false, false, true, false, false },
16953 /* 01H (LT_OS) */ { true, false, false, true, false },
16954 /* 02H (LE_OS) */ { true, false, true, true, false },
16955 /* 03H (UNORD_Q) */ { false, true, false, false, false },
16956 /* 04H (NEQ_UQ) */ { false, true, false, true, true },
16957 /* 05H (NLT_US) */ { true, true, true, false, true },
16958 /* 06H (NLE_US) */ { true, true, false, false, true },
16959 /* 07H (ORQ_Q) */ { false, false, true, true, true },
16960 /** @todo AVX variants. */
16961};
16962
16963
16964static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
16965{
16966 bool fRes;
16967 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
16968
16969 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
16970 {
16971 *pfMxcsr |= X86_MXCSR_IE;
16972 fRes = g_aCmpTbl[bEvil].fUnordered;
16973 }
16974 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
16975 {
16976 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
16977 *pfMxcsr |= X86_MXCSR_IE;
16978 fRes = g_aCmpTbl[bEvil].fUnordered;
16979 }
16980 else
16981 {
16982 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16983
16984 RTFLOAT32U r32Src1, r32Src2;
16985 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
16986 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
16987
16988 *pfMxcsr |= fDe;
16989 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
16990 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
16991 if (f32_eq(f32Src1, f32Src2, &SoftState))
16992 fRes = g_aCmpTbl[bEvil].fEqual;
16993 else if (f32_lt(f32Src1, f32Src2, &SoftState))
16994 fRes = g_aCmpTbl[bEvil].fLowerThan;
16995 else
16996 fRes = g_aCmpTbl[bEvil].fGreaterThan;
16997 }
16998
16999 return fRes;
17000}
17001
17002
17003static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
17004{
17005 bool fRes;
17006 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
17007
17008 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
17009 {
17010 *pfMxcsr |= X86_MXCSR_IE;
17011 fRes = g_aCmpTbl[bEvil].fUnordered;
17012 }
17013 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
17014 {
17015 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
17016 *pfMxcsr |= X86_MXCSR_IE;
17017 fRes = g_aCmpTbl[bEvil].fUnordered;
17018 }
17019 else
17020 {
17021 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17022
17023 RTFLOAT64U r64Src1, r64Src2;
17024 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1);
17025 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
17026
17027 *pfMxcsr |= fDe;
17028 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17029 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17030 if (f64_eq(f64Src1, f64Src2, &SoftState))
17031 fRes = g_aCmpTbl[bEvil].fEqual;
17032 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17033 fRes = g_aCmpTbl[bEvil].fLowerThan;
17034 else
17035 fRes = g_aCmpTbl[bEvil].fGreaterThan;
17036 }
17037
17038 return fRes;
17039}
17040
17041
17042IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17043{
17044 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
17045 {
17046 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
17047 puDst->au32[i] = UINT32_MAX;
17048 else
17049 puDst->au32[i] = 0;
17050 }
17051}
17052
17053
17054IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17055{
17056 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
17057 {
17058 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
17059 puDst->au64[i] = UINT64_MAX;
17060 else
17061 puDst->au64[i] = 0;
17062 }
17063}
17064
17065
17066IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17067{
17068 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
17069 puDst->au32[0] = UINT32_MAX;
17070 else
17071 puDst->au32[0] = 0;
17072
17073 puDst->au32[1] = pSrc->uSrc1.au32[1];
17074 puDst->au64[1] = pSrc->uSrc1.au64[1];
17075}
17076
17077
17078IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17079{
17080 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
17081 puDst->au64[0] = UINT64_MAX;
17082 else
17083 puDst->au64[0] = 0;
17084
17085 puDst->au64[1] = pSrc->uSrc1.au64[1];
17086}
17087#endif
17088
17089
17090/**
17091 * CVTPD2PI
17092 */
17093#ifdef IEM_WITHOUT_ASSEMBLY
17094static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17095{
17096 RTFLOAT64U r64Src;
17097 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17098
17099 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17100 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17101 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17102}
17103
17104
17105IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17106{
17107 RTUINT64U u64Res;
17108 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17109 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17110
17111 *pu64Dst = u64Res.u;
17112 *pfMxcsr = fMxcsrOut;
17113}
17114#endif
17115
17116
17117/**
17118 * CVTTPD2PI
17119 */
17120#ifdef IEM_WITHOUT_ASSEMBLY
17121static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17122{
17123 RTFLOAT64U r64Src;
17124 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17125
17126 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17127 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17128 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17129}
17130
17131
17132IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17133{
17134 RTUINT64U u64Res;
17135 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17136 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17137
17138 *pu64Dst = u64Res.u;
17139 *pfMxcsr = fMxcsrOut;
17140}
17141#endif
17142
17143
17144/**
17145 * CVTPI2PS
17146 */
17147#ifdef IEM_WITHOUT_ASSEMBLY
17148static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
17149{
17150 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17151 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
17152 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
17153}
17154
17155
17156IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
17157{
17158 RTUINT64U uSrc = { u64Src };
17159 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
17160 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
17161 *pfMxcsr = fMxcsrOut;
17162}
17163#endif
17164
17165
17166/**
17167 * CVTPI2PD
17168 */
17169#ifdef IEM_WITHOUT_ASSEMBLY
17170static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
17171{
17172 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17173 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
17174 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
17175}
17176
17177
17178IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
17179{
17180 RTUINT64U uSrc = { u64Src };
17181 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
17182 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
17183 *pfMxcsr = fMxcsrOut;
17184}
17185#endif
17186
17187
17188/**
17189 * CVTPS2PI
17190 */
17191#ifdef IEM_WITHOUT_ASSEMBLY
17192static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
17193{
17194 RTFLOAT32U r32Src;
17195 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
17196
17197 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17198 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17199 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17200}
17201
17202
17203IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
17204{
17205 RTUINT64U uDst;
17206 RTUINT64U uSrc = { u64Src };
17207 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
17208 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
17209 *pu64Dst = uDst.u;
17210 *pfMxcsr = fMxcsrOut;
17211}
17212#endif
17213
17214
17215/**
17216 * CVTTPS2PI
17217 */
17218#ifdef IEM_WITHOUT_ASSEMBLY
17219static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
17220{
17221 RTFLOAT32U r32Src;
17222 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
17223
17224 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17225 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17226 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17227}
17228
17229
17230IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
17231{
17232 RTUINT64U uDst;
17233 RTUINT64U uSrc = { u64Src };
17234 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
17235 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
17236 *pu64Dst = uDst.u;
17237 *pfMxcsr = fMxcsrOut;
17238}
17239#endif
17240
17241/**
17242 * RDRAND
17243 */
17244IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
17245{
17246 *puDst = 0;
17247 *pEFlags &= ~X86_EFL_STATUS_BITS;
17248 *pEFlags |= X86_EFL_CF;
17249}
17250
17251IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
17252{
17253 *puDst = 0;
17254 *pEFlags &= ~X86_EFL_STATUS_BITS;
17255 *pEFlags |= X86_EFL_CF;
17256}
17257
17258IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
17259{
17260 *puDst = 0;
17261 *pEFlags &= ~X86_EFL_STATUS_BITS;
17262 *pEFlags |= X86_EFL_CF;
17263}
17264
17265/**
17266 * RDSEED
17267 */
17268IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
17269{
17270 *puDst = 0;
17271 *pEFlags &= ~X86_EFL_STATUS_BITS;
17272 *pEFlags |= X86_EFL_CF;
17273}
17274
17275IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
17276{
17277 *puDst = 0;
17278 *pEFlags &= ~X86_EFL_STATUS_BITS;
17279 *pEFlags |= X86_EFL_CF;
17280}
17281
17282IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
17283{
17284 *puDst = 0;
17285 *pEFlags &= ~X86_EFL_STATUS_BITS;
17286 *pEFlags |= X86_EFL_CF;
17287}
17288
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette