VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 96945

Last change on this file since 96945 was 96930, checked in by vboxsync, 2 years ago

VMM/IEM: Implement cvtps2pi/cvttps2pi instructions, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 632.3 KB
Line 
1/* $Id: IEMAllAImplC.cpp 96930 2022-09-29 09:55:19Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2022 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless the build already defined it, it is defined here for ARM32 and ARM64
 * targets and when Doxygen is running.  IEM_WITH_ASSEMBLY, when defined, takes
 * it away again.
 */
#ifndef IEM_WITHOUT_ASSEMBLY
# if defined(DOXYGEN_RUNNING) || defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  define IEM_WITHOUT_ASSEMBLY
# endif
#endif
/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
#if defined(IEM_WITH_ASSEMBLY)
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Calculates the sign flag (SF) value given a result and its bit width.
 *
 * SF duplicates the most significant bit of the result, so the result is
 * shifted right until that bit sits at the SF bit position and everything
 * else is masked away.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( X86_EFL_SF & (uint32_t)((a_uResult) >> ((a_cBitsWidth) - 1 - X86_EFL_SF_BIT)) )
73
/**
 * Calculates the zero flag (ZF) value given a result.
 *
 * ZF is set exactly when the result is zero.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)!(a_uResult) << X86_EFL_ZF_BIT )
84
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * The input carries the overflow indication in its most significant bit (bit
 * 7, 15, 31 or 63 depending on the variant); these macros move that bit to the
 * OF position and mask off the rest.
 *
 * They are typically selected by concatenating the name with a bit count.  The
 * 8-bit variant is the odd one out: its top bit lies below the OF position, so
 * it must shift left where the others shift right.
 */
#define X86_EFL_GET_OF_8(a_uValue)  (X86_EFL_OF & ((uint32_t)(a_uValue) << (X86_EFL_OF_BIT + 1 - 8)))
#define X86_EFL_GET_OF_16(a_uValue) (X86_EFL_OF & (uint32_t)((a_uValue) >> (16 - 1 - X86_EFL_OF_BIT)))
#define X86_EFL_GET_OF_32(a_uValue) (X86_EFL_OF & (uint32_t)((a_uValue) >> (32 - 1 - X86_EFL_OF_BIT)))
#define X86_EFL_GET_OF_64(a_uValue) (X86_EFL_OF & (uint32_t)((a_uValue) >> (64 - 1 - X86_EFL_OF_BIT)))
95
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after an arithmetic op.
 *
 * This expands to a statement and yields no value; the recomputed flags are
 * stored back through @a a_pfEFlags.
 *
 * Overflow during ADDition happens when both inputs have the same sign bit
 * value and the result has a different sign bit value.
 *
 * Since subtraction can be rewritten as addition (2 - 1 == 2 + -1), it follows
 * that for SUBtraction the sign bit value must differ between the two inputs
 * and the result's sign bit must differ from the first input.  Subtracting
 * callers therefore hand in the source with its sign bit flipped as
 * @a a_uSrcOf.
 * Note! Must xor with sign bit to convert, not do (0 - a_uSrc).
 *
 * See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be a
 *                          plain literal as it is token-pasted into a type
 *                          name and a macro name.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        /* CF: supplied by the caller as a boolean expression. */ \
        fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
        /* PF: looked up from the low byte of the result. */ \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        /* AF: result ^ src ^ dst, masked down to the AF bit. */ \
        fEflTmp |= X86_EFL_AF & ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)); \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* OF: inputs agree in sign (dst vs. a_uSrcOf) while result and dst disagree. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                      & RT_BIT_64(a_cBitsWidth - 1)) \
                                                   & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
132
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand
 * is undefined.  We do not set AF, as that seems to make the most sense (which
 * probably makes it the most wrong in real life).
 *
 * This expands to a statement and yields no value; the recomputed flags are
 * stored back through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        /* Drop every old status bit; only PF, ZF, SF and a_fExtra are put back. */ \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
161/**
162 * Parity calculation table.
163 *
164 * This is also used by iemAllAImpl.asm.
165 *
166 * The generator code:
167 * @code
168 * #include <stdio.h>
169 *
170 * int main()
171 * {
172 * unsigned b;
173 * for (b = 0; b < 256; b++)
174 * {
175 * int cOnes = ( b & 1)
176 * + ((b >> 1) & 1)
177 * + ((b >> 2) & 1)
178 * + ((b >> 3) & 1)
179 * + ((b >> 4) & 1)
180 * + ((b >> 5) & 1)
181 * + ((b >> 6) & 1)
182 * + ((b >> 7) & 1);
183 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
184 * b,
185 * (b >> 7) & 1,
186 * (b >> 6) & 1,
187 * (b >> 5) & 1,
188 * (b >> 4) & 1,
189 * (b >> 3) & 1,
190 * (b >> 2) & 1,
191 * (b >> 1) & 1,
192 * b & 1,
193 * cOnes & 1 ? "0" : "X86_EFL_PF");
194 * }
195 * return 0;
196 * }
197 * @endcode
198 */
199uint8_t const g_afParity[256] =
200{
201 /* 0000 = 00000000b */ X86_EFL_PF,
202 /* 0x01 = 00000001b */ 0,
203 /* 0x02 = 00000010b */ 0,
204 /* 0x03 = 00000011b */ X86_EFL_PF,
205 /* 0x04 = 00000100b */ 0,
206 /* 0x05 = 00000101b */ X86_EFL_PF,
207 /* 0x06 = 00000110b */ X86_EFL_PF,
208 /* 0x07 = 00000111b */ 0,
209 /* 0x08 = 00001000b */ 0,
210 /* 0x09 = 00001001b */ X86_EFL_PF,
211 /* 0x0a = 00001010b */ X86_EFL_PF,
212 /* 0x0b = 00001011b */ 0,
213 /* 0x0c = 00001100b */ X86_EFL_PF,
214 /* 0x0d = 00001101b */ 0,
215 /* 0x0e = 00001110b */ 0,
216 /* 0x0f = 00001111b */ X86_EFL_PF,
217 /* 0x10 = 00010000b */ 0,
218 /* 0x11 = 00010001b */ X86_EFL_PF,
219 /* 0x12 = 00010010b */ X86_EFL_PF,
220 /* 0x13 = 00010011b */ 0,
221 /* 0x14 = 00010100b */ X86_EFL_PF,
222 /* 0x15 = 00010101b */ 0,
223 /* 0x16 = 00010110b */ 0,
224 /* 0x17 = 00010111b */ X86_EFL_PF,
225 /* 0x18 = 00011000b */ X86_EFL_PF,
226 /* 0x19 = 00011001b */ 0,
227 /* 0x1a = 00011010b */ 0,
228 /* 0x1b = 00011011b */ X86_EFL_PF,
229 /* 0x1c = 00011100b */ 0,
230 /* 0x1d = 00011101b */ X86_EFL_PF,
231 /* 0x1e = 00011110b */ X86_EFL_PF,
232 /* 0x1f = 00011111b */ 0,
233 /* 0x20 = 00100000b */ 0,
234 /* 0x21 = 00100001b */ X86_EFL_PF,
235 /* 0x22 = 00100010b */ X86_EFL_PF,
236 /* 0x23 = 00100011b */ 0,
237 /* 0x24 = 00100100b */ X86_EFL_PF,
238 /* 0x25 = 00100101b */ 0,
239 /* 0x26 = 00100110b */ 0,
240 /* 0x27 = 00100111b */ X86_EFL_PF,
241 /* 0x28 = 00101000b */ X86_EFL_PF,
242 /* 0x29 = 00101001b */ 0,
243 /* 0x2a = 00101010b */ 0,
244 /* 0x2b = 00101011b */ X86_EFL_PF,
245 /* 0x2c = 00101100b */ 0,
246 /* 0x2d = 00101101b */ X86_EFL_PF,
247 /* 0x2e = 00101110b */ X86_EFL_PF,
248 /* 0x2f = 00101111b */ 0,
249 /* 0x30 = 00110000b */ X86_EFL_PF,
250 /* 0x31 = 00110001b */ 0,
251 /* 0x32 = 00110010b */ 0,
252 /* 0x33 = 00110011b */ X86_EFL_PF,
253 /* 0x34 = 00110100b */ 0,
254 /* 0x35 = 00110101b */ X86_EFL_PF,
255 /* 0x36 = 00110110b */ X86_EFL_PF,
256 /* 0x37 = 00110111b */ 0,
257 /* 0x38 = 00111000b */ 0,
258 /* 0x39 = 00111001b */ X86_EFL_PF,
259 /* 0x3a = 00111010b */ X86_EFL_PF,
260 /* 0x3b = 00111011b */ 0,
261 /* 0x3c = 00111100b */ X86_EFL_PF,
262 /* 0x3d = 00111101b */ 0,
263 /* 0x3e = 00111110b */ 0,
264 /* 0x3f = 00111111b */ X86_EFL_PF,
265 /* 0x40 = 01000000b */ 0,
266 /* 0x41 = 01000001b */ X86_EFL_PF,
267 /* 0x42 = 01000010b */ X86_EFL_PF,
268 /* 0x43 = 01000011b */ 0,
269 /* 0x44 = 01000100b */ X86_EFL_PF,
270 /* 0x45 = 01000101b */ 0,
271 /* 0x46 = 01000110b */ 0,
272 /* 0x47 = 01000111b */ X86_EFL_PF,
273 /* 0x48 = 01001000b */ X86_EFL_PF,
274 /* 0x49 = 01001001b */ 0,
275 /* 0x4a = 01001010b */ 0,
276 /* 0x4b = 01001011b */ X86_EFL_PF,
277 /* 0x4c = 01001100b */ 0,
278 /* 0x4d = 01001101b */ X86_EFL_PF,
279 /* 0x4e = 01001110b */ X86_EFL_PF,
280 /* 0x4f = 01001111b */ 0,
281 /* 0x50 = 01010000b */ X86_EFL_PF,
282 /* 0x51 = 01010001b */ 0,
283 /* 0x52 = 01010010b */ 0,
284 /* 0x53 = 01010011b */ X86_EFL_PF,
285 /* 0x54 = 01010100b */ 0,
286 /* 0x55 = 01010101b */ X86_EFL_PF,
287 /* 0x56 = 01010110b */ X86_EFL_PF,
288 /* 0x57 = 01010111b */ 0,
289 /* 0x58 = 01011000b */ 0,
290 /* 0x59 = 01011001b */ X86_EFL_PF,
291 /* 0x5a = 01011010b */ X86_EFL_PF,
292 /* 0x5b = 01011011b */ 0,
293 /* 0x5c = 01011100b */ X86_EFL_PF,
294 /* 0x5d = 01011101b */ 0,
295 /* 0x5e = 01011110b */ 0,
296 /* 0x5f = 01011111b */ X86_EFL_PF,
297 /* 0x60 = 01100000b */ X86_EFL_PF,
298 /* 0x61 = 01100001b */ 0,
299 /* 0x62 = 01100010b */ 0,
300 /* 0x63 = 01100011b */ X86_EFL_PF,
301 /* 0x64 = 01100100b */ 0,
302 /* 0x65 = 01100101b */ X86_EFL_PF,
303 /* 0x66 = 01100110b */ X86_EFL_PF,
304 /* 0x67 = 01100111b */ 0,
305 /* 0x68 = 01101000b */ 0,
306 /* 0x69 = 01101001b */ X86_EFL_PF,
307 /* 0x6a = 01101010b */ X86_EFL_PF,
308 /* 0x6b = 01101011b */ 0,
309 /* 0x6c = 01101100b */ X86_EFL_PF,
310 /* 0x6d = 01101101b */ 0,
311 /* 0x6e = 01101110b */ 0,
312 /* 0x6f = 01101111b */ X86_EFL_PF,
313 /* 0x70 = 01110000b */ 0,
314 /* 0x71 = 01110001b */ X86_EFL_PF,
315 /* 0x72 = 01110010b */ X86_EFL_PF,
316 /* 0x73 = 01110011b */ 0,
317 /* 0x74 = 01110100b */ X86_EFL_PF,
318 /* 0x75 = 01110101b */ 0,
319 /* 0x76 = 01110110b */ 0,
320 /* 0x77 = 01110111b */ X86_EFL_PF,
321 /* 0x78 = 01111000b */ X86_EFL_PF,
322 /* 0x79 = 01111001b */ 0,
323 /* 0x7a = 01111010b */ 0,
324 /* 0x7b = 01111011b */ X86_EFL_PF,
325 /* 0x7c = 01111100b */ 0,
326 /* 0x7d = 01111101b */ X86_EFL_PF,
327 /* 0x7e = 01111110b */ X86_EFL_PF,
328 /* 0x7f = 01111111b */ 0,
329 /* 0x80 = 10000000b */ 0,
330 /* 0x81 = 10000001b */ X86_EFL_PF,
331 /* 0x82 = 10000010b */ X86_EFL_PF,
332 /* 0x83 = 10000011b */ 0,
333 /* 0x84 = 10000100b */ X86_EFL_PF,
334 /* 0x85 = 10000101b */ 0,
335 /* 0x86 = 10000110b */ 0,
336 /* 0x87 = 10000111b */ X86_EFL_PF,
337 /* 0x88 = 10001000b */ X86_EFL_PF,
338 /* 0x89 = 10001001b */ 0,
339 /* 0x8a = 10001010b */ 0,
340 /* 0x8b = 10001011b */ X86_EFL_PF,
341 /* 0x8c = 10001100b */ 0,
342 /* 0x8d = 10001101b */ X86_EFL_PF,
343 /* 0x8e = 10001110b */ X86_EFL_PF,
344 /* 0x8f = 10001111b */ 0,
345 /* 0x90 = 10010000b */ X86_EFL_PF,
346 /* 0x91 = 10010001b */ 0,
347 /* 0x92 = 10010010b */ 0,
348 /* 0x93 = 10010011b */ X86_EFL_PF,
349 /* 0x94 = 10010100b */ 0,
350 /* 0x95 = 10010101b */ X86_EFL_PF,
351 /* 0x96 = 10010110b */ X86_EFL_PF,
352 /* 0x97 = 10010111b */ 0,
353 /* 0x98 = 10011000b */ 0,
354 /* 0x99 = 10011001b */ X86_EFL_PF,
355 /* 0x9a = 10011010b */ X86_EFL_PF,
356 /* 0x9b = 10011011b */ 0,
357 /* 0x9c = 10011100b */ X86_EFL_PF,
358 /* 0x9d = 10011101b */ 0,
359 /* 0x9e = 10011110b */ 0,
360 /* 0x9f = 10011111b */ X86_EFL_PF,
361 /* 0xa0 = 10100000b */ X86_EFL_PF,
362 /* 0xa1 = 10100001b */ 0,
363 /* 0xa2 = 10100010b */ 0,
364 /* 0xa3 = 10100011b */ X86_EFL_PF,
365 /* 0xa4 = 10100100b */ 0,
366 /* 0xa5 = 10100101b */ X86_EFL_PF,
367 /* 0xa6 = 10100110b */ X86_EFL_PF,
368 /* 0xa7 = 10100111b */ 0,
369 /* 0xa8 = 10101000b */ 0,
370 /* 0xa9 = 10101001b */ X86_EFL_PF,
371 /* 0xaa = 10101010b */ X86_EFL_PF,
372 /* 0xab = 10101011b */ 0,
373 /* 0xac = 10101100b */ X86_EFL_PF,
374 /* 0xad = 10101101b */ 0,
375 /* 0xae = 10101110b */ 0,
376 /* 0xaf = 10101111b */ X86_EFL_PF,
377 /* 0xb0 = 10110000b */ 0,
378 /* 0xb1 = 10110001b */ X86_EFL_PF,
379 /* 0xb2 = 10110010b */ X86_EFL_PF,
380 /* 0xb3 = 10110011b */ 0,
381 /* 0xb4 = 10110100b */ X86_EFL_PF,
382 /* 0xb5 = 10110101b */ 0,
383 /* 0xb6 = 10110110b */ 0,
384 /* 0xb7 = 10110111b */ X86_EFL_PF,
385 /* 0xb8 = 10111000b */ X86_EFL_PF,
386 /* 0xb9 = 10111001b */ 0,
387 /* 0xba = 10111010b */ 0,
388 /* 0xbb = 10111011b */ X86_EFL_PF,
389 /* 0xbc = 10111100b */ 0,
390 /* 0xbd = 10111101b */ X86_EFL_PF,
391 /* 0xbe = 10111110b */ X86_EFL_PF,
392 /* 0xbf = 10111111b */ 0,
393 /* 0xc0 = 11000000b */ X86_EFL_PF,
394 /* 0xc1 = 11000001b */ 0,
395 /* 0xc2 = 11000010b */ 0,
396 /* 0xc3 = 11000011b */ X86_EFL_PF,
397 /* 0xc4 = 11000100b */ 0,
398 /* 0xc5 = 11000101b */ X86_EFL_PF,
399 /* 0xc6 = 11000110b */ X86_EFL_PF,
400 /* 0xc7 = 11000111b */ 0,
401 /* 0xc8 = 11001000b */ 0,
402 /* 0xc9 = 11001001b */ X86_EFL_PF,
403 /* 0xca = 11001010b */ X86_EFL_PF,
404 /* 0xcb = 11001011b */ 0,
405 /* 0xcc = 11001100b */ X86_EFL_PF,
406 /* 0xcd = 11001101b */ 0,
407 /* 0xce = 11001110b */ 0,
408 /* 0xcf = 11001111b */ X86_EFL_PF,
409 /* 0xd0 = 11010000b */ 0,
410 /* 0xd1 = 11010001b */ X86_EFL_PF,
411 /* 0xd2 = 11010010b */ X86_EFL_PF,
412 /* 0xd3 = 11010011b */ 0,
413 /* 0xd4 = 11010100b */ X86_EFL_PF,
414 /* 0xd5 = 11010101b */ 0,
415 /* 0xd6 = 11010110b */ 0,
416 /* 0xd7 = 11010111b */ X86_EFL_PF,
417 /* 0xd8 = 11011000b */ X86_EFL_PF,
418 /* 0xd9 = 11011001b */ 0,
419 /* 0xda = 11011010b */ 0,
420 /* 0xdb = 11011011b */ X86_EFL_PF,
421 /* 0xdc = 11011100b */ 0,
422 /* 0xdd = 11011101b */ X86_EFL_PF,
423 /* 0xde = 11011110b */ X86_EFL_PF,
424 /* 0xdf = 11011111b */ 0,
425 /* 0xe0 = 11100000b */ 0,
426 /* 0xe1 = 11100001b */ X86_EFL_PF,
427 /* 0xe2 = 11100010b */ X86_EFL_PF,
428 /* 0xe3 = 11100011b */ 0,
429 /* 0xe4 = 11100100b */ X86_EFL_PF,
430 /* 0xe5 = 11100101b */ 0,
431 /* 0xe6 = 11100110b */ 0,
432 /* 0xe7 = 11100111b */ X86_EFL_PF,
433 /* 0xe8 = 11101000b */ X86_EFL_PF,
434 /* 0xe9 = 11101001b */ 0,
435 /* 0xea = 11101010b */ 0,
436 /* 0xeb = 11101011b */ X86_EFL_PF,
437 /* 0xec = 11101100b */ 0,
438 /* 0xed = 11101101b */ X86_EFL_PF,
439 /* 0xee = 11101110b */ X86_EFL_PF,
440 /* 0xef = 11101111b */ 0,
441 /* 0xf0 = 11110000b */ X86_EFL_PF,
442 /* 0xf1 = 11110001b */ 0,
443 /* 0xf2 = 11110010b */ 0,
444 /* 0xf3 = 11110011b */ X86_EFL_PF,
445 /* 0xf4 = 11110100b */ 0,
446 /* 0xf5 = 11110101b */ X86_EFL_PF,
447 /* 0xf6 = 11110110b */ X86_EFL_PF,
448 /* 0xf7 = 11110111b */ 0,
449 /* 0xf8 = 11111000b */ 0,
450 /* 0xf9 = 11111001b */ X86_EFL_PF,
451 /* 0xfa = 11111010b */ X86_EFL_PF,
452 /* 0xfb = 11111011b */ 0,
453 /* 0xfc = 11111100b */ X86_EFL_PF,
454 /* 0xfd = 11111101b */ 0,
455 /* 0xfe = 11111110b */ 0,
456 /* 0xff = 11111111b */ X86_EFL_PF,
457};
458
459/* for clang: */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT80U g_ar80One[];
464extern const RTFLOAT80U g_r80Indefinite;
465extern const RTFLOAT32U g_ar32Infinity[];
466extern const RTFLOAT64U g_ar64Infinity[];
467extern const RTFLOAT80U g_ar80Infinity[];
468extern const RTFLOAT128U g_r128Ln2;
469extern const RTUINT128U g_u128Ln2Mantissa;
470extern const RTUINT128U g_u128Ln2MantissaIntel;
471extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
472extern const RTFLOAT32U g_ar32QNaN[];
473extern const RTFLOAT64U g_ar64QNaN[];
474
475/** Zero values (indexed by fSign). */
476RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
477RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
478RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
479
480/** One values (indexed by fSign). */
481RTFLOAT80U const g_ar80One[] =
482{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
483
484/** Indefinite (negative). */
485RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
486
487/** Infinities (indexed by fSign). */
488RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
489RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
490RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
491
492/** Default QNaNs (indexed by fSign). */
493RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
494RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
495
496
497#if 0
498/** 128-bit floating point constant: 2.0 */
499const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
500#endif
501
502
503/* The next section is generated by tools/IEMGenFpuConstants: */
504
505/** The ln2 constant as 128-bit floating point value.
506 * base-10: 6.93147180559945309417232121458176575e-1
507 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
508 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
509//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
510const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
511/** High precision ln2 value.
512 * base-10: 6.931471805599453094172321214581765680747e-1
513 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
514 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
515const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
516/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
517 * base-10: 6.931471805599453094151379470289064954613e-1
518 * base-16: b.17217f7d1cf79abc0000000000000000@-1
519 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
520const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
521
522/** Horner constants for f2xm1 */
523const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
524{
525 /* a0
526 * base-10: 1.00000000000000000000000000000000000e0
527 * base-16: 1.0000000000000000000000000000@0
528 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
529 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
530 /* a1
531 * base-10: 5.00000000000000000000000000000000000e-1
532 * base-16: 8.0000000000000000000000000000@-1
533 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
534 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
535 /* a2
536 * base-10: 1.66666666666666666666666666666666658e-1
537 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
538 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
539 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
540 /* a3
541 * base-10: 4.16666666666666666666666666666666646e-2
542 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
543 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
544 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
545 /* a4
546 * base-10: 8.33333333333333333333333333333333323e-3
547 * base-16: 2.2222222222222222222222222222@-2
548 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
549 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
550 /* a5
551 * base-10: 1.38888888888888888888888888888888874e-3
552 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
553 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
554 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
555 /* a6
556 * base-10: 1.98412698412698412698412698412698412e-4
557 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
558 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
559 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
560 /* a7
561 * base-10: 2.48015873015873015873015873015873015e-5
562 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
563 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
564 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
565 /* a8
566 * base-10: 2.75573192239858906525573192239858902e-6
567 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
568 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
569 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
570 /* a9
571 * base-10: 2.75573192239858906525573192239858865e-7
572 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
573 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
574 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
575 /* a10
576 * base-10: 2.50521083854417187750521083854417184e-8
577 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
578 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
579 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
580 /* a11
581 * base-10: 2.08767569878680989792100903212014296e-9
582 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
583 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
584 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
585 /* a12
586 * base-10: 1.60590438368216145993923771701549472e-10
587 * base-16: b.092309d43684be51c198e91d7b40@-9
588 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
589 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
590 /* a13
591 * base-10: 1.14707455977297247138516979786821043e-11
592 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
593 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
594 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
595 /* a14
596 * base-10: 7.64716373181981647590113198578806964e-13
597 * base-16: d.73f9f399dc0f88ec32b587746578@-11
598 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
599 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
600 /* a15
601 * base-10: 4.77947733238738529743820749111754352e-14
602 * base-16: d.73f9f399dc0f88ec32b587746578@-12
603 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
604 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
605 /* a16
606 * base-10: 2.81145725434552076319894558301031970e-15
607 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
608 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
609 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
610 /* a17
611 * base-10: 1.56192069685862264622163643500573321e-16
612 * base-16: b.413c31dcbecbbdd8024435161550@-14
613 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
614 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
615 /* a18
616 * base-10: 8.22063524662432971695598123687227980e-18
617 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
618 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
619 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
620 /* a19
621 * base-10: 4.11031762331216485847799061843614006e-19
622 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
623 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
624 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
625 /* a20
626 * base-10: 1.95729410633912612308475743735054143e-20
627 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
628 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
629 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
630 /* a21
631 * base-10: 8.89679139245057328674889744250246106e-22
632 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
633 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
634 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
635};
636
637
638/*
639 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
640 * it all in C is probably safer atm., optimize what's necessary later, maybe.
641 */
642#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
643
644
645/*********************************************************************************************************************************
646* Binary Operations *
647*********************************************************************************************************************************/
648
649/*
650 * ADD
651 */
652
653IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
654{
655 uint64_t uDst = *puDst;
656 uint64_t uResult = uDst + uSrc;
657 *puDst = uResult;
658 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
659}
660
661# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
662
663IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
664{
665 uint32_t uDst = *puDst;
666 uint32_t uResult = uDst + uSrc;
667 *puDst = uResult;
668 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
669}
670
671
672IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
673{
674 uint16_t uDst = *puDst;
675 uint16_t uResult = uDst + uSrc;
676 *puDst = uResult;
677 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
678}
679
680
681IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
682{
683 uint8_t uDst = *puDst;
684 uint8_t uResult = uDst + uSrc;
685 *puDst = uResult;
686 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
687}
688
689# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
690
691/*
692 * ADC
693 */
694
695IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
696{
697 if (!(*pfEFlags & X86_EFL_CF))
698 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
699 else
700 {
701 uint64_t uDst = *puDst;
702 uint64_t uResult = uDst + uSrc + 1;
703 *puDst = uResult;
704 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
705 }
706}
707
708# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
709
710IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
711{
712 if (!(*pfEFlags & X86_EFL_CF))
713 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
714 else
715 {
716 uint32_t uDst = *puDst;
717 uint32_t uResult = uDst + uSrc + 1;
718 *puDst = uResult;
719 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
720 }
721}
722
723
724IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
725{
726 if (!(*pfEFlags & X86_EFL_CF))
727 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
728 else
729 {
730 uint16_t uDst = *puDst;
731 uint16_t uResult = uDst + uSrc + 1;
732 *puDst = uResult;
733 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
734 }
735}
736
737
738IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
739{
740 if (!(*pfEFlags & X86_EFL_CF))
741 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
742 else
743 {
744 uint8_t uDst = *puDst;
745 uint8_t uResult = uDst + uSrc + 1;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
748 }
749}
750
751# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
752
753/*
754 * SUB
755 */
756
757IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
758{
759 uint64_t uDst = *puDst;
760 uint64_t uResult = uDst - uSrc;
761 *puDst = uResult;
762 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
763}
764
765# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
766
767IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
768{
769 uint32_t uDst = *puDst;
770 uint32_t uResult = uDst - uSrc;
771 *puDst = uResult;
772 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
773}
774
775
776IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
777{
778 uint16_t uDst = *puDst;
779 uint16_t uResult = uDst - uSrc;
780 *puDst = uResult;
781 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
782}
783
784
785IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
786{
787 uint8_t uDst = *puDst;
788 uint8_t uResult = uDst - uSrc;
789 *puDst = uResult;
790 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
791}
792
793# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
794
795/*
796 * SBB
797 */
798
799IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
800{
801 if (!(*pfEFlags & X86_EFL_CF))
802 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
803 else
804 {
805 uint64_t uDst = *puDst;
806 uint64_t uResult = uDst - uSrc - 1;
807 *puDst = uResult;
808 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
809 }
810}
811
812# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
813
814IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
815{
816 if (!(*pfEFlags & X86_EFL_CF))
817 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
818 else
819 {
820 uint32_t uDst = *puDst;
821 uint32_t uResult = uDst - uSrc - 1;
822 *puDst = uResult;
823 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
824 }
825}
826
827
828IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
829{
830 if (!(*pfEFlags & X86_EFL_CF))
831 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
832 else
833 {
834 uint16_t uDst = *puDst;
835 uint16_t uResult = uDst - uSrc - 1;
836 *puDst = uResult;
837 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
838 }
839}
840
841
842IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
843{
844 if (!(*pfEFlags & X86_EFL_CF))
845 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
846 else
847 {
848 uint8_t uDst = *puDst;
849 uint8_t uResult = uDst - uSrc - 1;
850 *puDst = uResult;
851 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
852 }
853}
854
855# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
856
857
858/*
859 * OR
860 */
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
863{
864 uint64_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
867}
868
869# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
870
871IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
872{
873 uint32_t uResult = *puDst | uSrc;
874 *puDst = uResult;
875 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
876}
877
878
879IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
880{
881 uint16_t uResult = *puDst | uSrc;
882 *puDst = uResult;
883 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
884}
885
886
887IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
888{
889 uint8_t uResult = *puDst | uSrc;
890 *puDst = uResult;
891 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
892}
893
894# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
895
896/*
897 * XOR
898 */
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
901{
902 uint64_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
905}
906
907# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
908
909IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
910{
911 uint32_t uResult = *puDst ^ uSrc;
912 *puDst = uResult;
913 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
914}
915
916
917IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
918{
919 uint16_t uResult = *puDst ^ uSrc;
920 *puDst = uResult;
921 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
922}
923
924
925IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
926{
927 uint8_t uResult = *puDst ^ uSrc;
928 *puDst = uResult;
929 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
930}
931
932# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
933
934/*
935 * AND
936 */
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
939{
940 uint64_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
943}
944
945# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
946
947IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
948{
949 uint32_t const uResult = *puDst & uSrc;
950 *puDst = uResult;
951 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
952}
953
954
955IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
956{
957 uint16_t const uResult = *puDst & uSrc;
958 *puDst = uResult;
959 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
960}
961
962
963IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
964{
965 uint8_t const uResult = *puDst & uSrc;
966 *puDst = uResult;
967 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
968}
969
970# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
971#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
972
973/*
974 * ANDN (BMI1 instruction)
975 */
976
977IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
978{
979 uint64_t const uResult = ~uSrc1 & uSrc2;
980 *puDst = uResult;
981 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
982}
983
984
985IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
986{
987 uint32_t const uResult = ~uSrc1 & uSrc2;
988 *puDst = uResult;
989 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
990}
991
992
993#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
994IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
995{
996 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
997}
998#endif
999
1000
1001#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1002IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1003{
1004 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1005}
1006#endif
1007
1008#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1009
1010/*
1011 * CMP
1012 */
1013
1014IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1015{
1016 uint64_t uDstTmp = *puDst;
1017 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1018}
1019
1020# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1021
1022IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1023{
1024 uint32_t uDstTmp = *puDst;
1025 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1026}
1027
1028
1029IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1030{
1031 uint16_t uDstTmp = *puDst;
1032 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1033}
1034
1035
1036IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1037{
1038 uint8_t uDstTmp = *puDst;
1039 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1040}
1041
1042# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1043
1044/*
1045 * TEST
1046 */
1047
1048IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1049{
1050 uint64_t uResult = *puDst & uSrc;
1051 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1052}
1053
1054# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1055
1056IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1057{
1058 uint32_t uResult = *puDst & uSrc;
1059 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1060}
1061
1062
1063IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1064{
1065 uint16_t uResult = *puDst & uSrc;
1066 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1067}
1068
1069
1070IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1071{
1072 uint8_t uResult = *puDst & uSrc;
1073 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1074}
1075
1076# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1077
1078
1079/*
1080 * LOCK prefixed variants of the above
1081 */
1082
/**
 * Locked binary operand operation, generic over the operand width.
 *
 * Must be expanded inside a function that has @c puDst, @c uSrc and
 * @c pfEFlags in scope (the macro uses them without taking them as
 * arguments).  The non-locked worker iemAImpl_<a_Mnemonic>_u<a_cBitsWidth> is
 * run on private copies of the destination value and of EFLAGS, and the
 * result is committed with a compare-and-exchange on @c puDst.  While the
 * exchange fails the computation is repeated starting from @c uOld, which is
 * handed to the exchange by address (presumably so it receives the current
 * destination value).  EFLAGS is only written back after a successful
 * exchange.
 *
 * @param   a_Mnemonic      Instruction mnemonic selecting the worker (add, or, bts, ...).
 * @param   a_cBitsWidth    Operand width in bits (8, 16, 32 or 64).
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uTmp; \
        uint32_t fEflTmp; \
        do \
        { \
            /* Work on copies so that a failed exchange leaves no trace. */ \
            uTmp = uOld; \
            fEflTmp = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, uSrc, &fEflTmp); \
        } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
        *pfEFlags = fEflTmp; \
    } while (0)
1097
1098
/**
 * Emits the function iemAImpl_<a_Mnemonic>_u<a_cBitsWidth>_locked, whose body
 * is the DO_LOCKED_BIN_OP expansion for the same mnemonic and width.
 */
#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked, \
                      (uint ## a_cBitsWidth ## _t *puDst, uint ## a_cBitsWidth ## _t uSrc, uint32_t *pfEFlags)) \
    { \
        DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
    }
1106
1107EMIT_LOCKED_BIN_OP(add, 64)
1108EMIT_LOCKED_BIN_OP(adc, 64)
1109EMIT_LOCKED_BIN_OP(sub, 64)
1110EMIT_LOCKED_BIN_OP(sbb, 64)
1111EMIT_LOCKED_BIN_OP(or, 64)
1112EMIT_LOCKED_BIN_OP(xor, 64)
1113EMIT_LOCKED_BIN_OP(and, 64)
1114# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1115EMIT_LOCKED_BIN_OP(add, 32)
1116EMIT_LOCKED_BIN_OP(adc, 32)
1117EMIT_LOCKED_BIN_OP(sub, 32)
1118EMIT_LOCKED_BIN_OP(sbb, 32)
1119EMIT_LOCKED_BIN_OP(or, 32)
1120EMIT_LOCKED_BIN_OP(xor, 32)
1121EMIT_LOCKED_BIN_OP(and, 32)
1122
1123EMIT_LOCKED_BIN_OP(add, 16)
1124EMIT_LOCKED_BIN_OP(adc, 16)
1125EMIT_LOCKED_BIN_OP(sub, 16)
1126EMIT_LOCKED_BIN_OP(sbb, 16)
1127EMIT_LOCKED_BIN_OP(or, 16)
1128EMIT_LOCKED_BIN_OP(xor, 16)
1129EMIT_LOCKED_BIN_OP(and, 16)
1130
1131EMIT_LOCKED_BIN_OP(add, 8)
1132EMIT_LOCKED_BIN_OP(adc, 8)
1133EMIT_LOCKED_BIN_OP(sub, 8)
1134EMIT_LOCKED_BIN_OP(sbb, 8)
1135EMIT_LOCKED_BIN_OP(or, 8)
1136EMIT_LOCKED_BIN_OP(xor, 8)
1137EMIT_LOCKED_BIN_OP(and, 8)
1138# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1139
1140
1141/*
1142 * Bit operations (same signature as above).
1143 */
1144
1145/*
1146 * BT
1147 */
1148
1149IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1150{
1151 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1152 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1153 Assert(uSrc < 64);
1154 uint64_t uDst = *puDst;
1155 if (uDst & RT_BIT_64(uSrc))
1156 *pfEFlags |= X86_EFL_CF;
1157 else
1158 *pfEFlags &= ~X86_EFL_CF;
1159}
1160
1161# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1162
1163IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1164{
1165 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1166 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1167 Assert(uSrc < 32);
1168 uint32_t uDst = *puDst;
1169 if (uDst & RT_BIT_32(uSrc))
1170 *pfEFlags |= X86_EFL_CF;
1171 else
1172 *pfEFlags &= ~X86_EFL_CF;
1173}
1174
1175IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1176{
1177 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1178 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1179 Assert(uSrc < 16);
1180 uint16_t uDst = *puDst;
1181 if (uDst & RT_BIT_32(uSrc))
1182 *pfEFlags |= X86_EFL_CF;
1183 else
1184 *pfEFlags &= ~X86_EFL_CF;
1185}
1186
1187# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1188
1189/*
1190 * BTC
1191 */
1192
1193IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1194{
1195 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1196 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1197 Assert(uSrc < 64);
1198 uint64_t fMask = RT_BIT_64(uSrc);
1199 uint64_t uDst = *puDst;
1200 if (uDst & fMask)
1201 {
1202 uDst &= ~fMask;
1203 *puDst = uDst;
1204 *pfEFlags |= X86_EFL_CF;
1205 }
1206 else
1207 {
1208 uDst |= fMask;
1209 *puDst = uDst;
1210 *pfEFlags &= ~X86_EFL_CF;
1211 }
1212}
1213
1214# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1215
1216IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1217{
1218 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1219 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1220 Assert(uSrc < 32);
1221 uint32_t fMask = RT_BIT_32(uSrc);
1222 uint32_t uDst = *puDst;
1223 if (uDst & fMask)
1224 {
1225 uDst &= ~fMask;
1226 *puDst = uDst;
1227 *pfEFlags |= X86_EFL_CF;
1228 }
1229 else
1230 {
1231 uDst |= fMask;
1232 *puDst = uDst;
1233 *pfEFlags &= ~X86_EFL_CF;
1234 }
1235}
1236
1237
1238IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1239{
1240 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1241 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1242 Assert(uSrc < 16);
1243 uint16_t fMask = RT_BIT_32(uSrc);
1244 uint16_t uDst = *puDst;
1245 if (uDst & fMask)
1246 {
1247 uDst &= ~fMask;
1248 *puDst = uDst;
1249 *pfEFlags |= X86_EFL_CF;
1250 }
1251 else
1252 {
1253 uDst |= fMask;
1254 *puDst = uDst;
1255 *pfEFlags &= ~X86_EFL_CF;
1256 }
1257}
1258
1259# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1260
1261/*
1262 * BTR
1263 */
1264
1265IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1266{
1267 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1268 logical operation (AND/OR/whatever). */
1269 Assert(uSrc < 64);
1270 uint64_t fMask = RT_BIT_64(uSrc);
1271 uint64_t uDst = *puDst;
1272 if (uDst & fMask)
1273 {
1274 uDst &= ~fMask;
1275 *puDst = uDst;
1276 *pfEFlags |= X86_EFL_CF;
1277 }
1278 else
1279 *pfEFlags &= ~X86_EFL_CF;
1280}
1281
1282# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1283
1284IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1285{
1286 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1287 logical operation (AND/OR/whatever). */
1288 Assert(uSrc < 32);
1289 uint32_t fMask = RT_BIT_32(uSrc);
1290 uint32_t uDst = *puDst;
1291 if (uDst & fMask)
1292 {
1293 uDst &= ~fMask;
1294 *puDst = uDst;
1295 *pfEFlags |= X86_EFL_CF;
1296 }
1297 else
1298 *pfEFlags &= ~X86_EFL_CF;
1299}
1300
1301
1302IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1303{
1304 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1305 logical operation (AND/OR/whatever). */
1306 Assert(uSrc < 16);
1307 uint16_t fMask = RT_BIT_32(uSrc);
1308 uint16_t uDst = *puDst;
1309 if (uDst & fMask)
1310 {
1311 uDst &= ~fMask;
1312 *puDst = uDst;
1313 *pfEFlags |= X86_EFL_CF;
1314 }
1315 else
1316 *pfEFlags &= ~X86_EFL_CF;
1317}
1318
1319# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1320
1321/*
1322 * BTS
1323 */
1324
1325IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1326{
1327 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1328 logical operation (AND/OR/whatever). */
1329 Assert(uSrc < 64);
1330 uint64_t fMask = RT_BIT_64(uSrc);
1331 uint64_t uDst = *puDst;
1332 if (uDst & fMask)
1333 *pfEFlags |= X86_EFL_CF;
1334 else
1335 {
1336 uDst |= fMask;
1337 *puDst = uDst;
1338 *pfEFlags &= ~X86_EFL_CF;
1339 }
1340}
1341
1342# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1343
1344IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1345{
1346 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1347 logical operation (AND/OR/whatever). */
1348 Assert(uSrc < 32);
1349 uint32_t fMask = RT_BIT_32(uSrc);
1350 uint32_t uDst = *puDst;
1351 if (uDst & fMask)
1352 *pfEFlags |= X86_EFL_CF;
1353 else
1354 {
1355 uDst |= fMask;
1356 *puDst = uDst;
1357 *pfEFlags &= ~X86_EFL_CF;
1358 }
1359}
1360
1361
1362IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1363{
1364 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1365 logical operation (AND/OR/whatever). */
1366 Assert(uSrc < 16);
1367 uint16_t fMask = RT_BIT_32(uSrc);
1368 uint32_t uDst = *puDst;
1369 if (uDst & fMask)
1370 *pfEFlags |= X86_EFL_CF;
1371 else
1372 {
1373 uDst |= fMask;
1374 *puDst = uDst;
1375 *pfEFlags &= ~X86_EFL_CF;
1376 }
1377}
1378
1379# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1380
1381
1382EMIT_LOCKED_BIN_OP(btc, 64)
1383EMIT_LOCKED_BIN_OP(btr, 64)
1384EMIT_LOCKED_BIN_OP(bts, 64)
1385# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1386EMIT_LOCKED_BIN_OP(btc, 32)
1387EMIT_LOCKED_BIN_OP(btr, 32)
1388EMIT_LOCKED_BIN_OP(bts, 32)
1389
1390EMIT_LOCKED_BIN_OP(btc, 16)
1391EMIT_LOCKED_BIN_OP(btr, 16)
1392EMIT_LOCKED_BIN_OP(bts, 16)
1393# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1394
1395
/*
 * Helpers for BSR and BSF.
 *
 * Note! "undefined" flags: OF, SF, AF, PF, CF.
 *       Intel behavior modelled on 10980xe, AMD on 3990X.  Other
 *       microarchitectures may produce different results (see
 *       https://www.sandpile.org/x86/flags.htm), but we restrict ourselves
 *       to emulating these recent ones.
 */
/**
 * Stores a BSF/BSR result the Intel way.
 *
 * @a a_iBit is treated as a 1-based bit position with 0 meaning that no bit
 * was found.  When a bit was found, its 0-based index is written to the
 * destination and PF is taken from g_afParity for that index; otherwise the
 * destination is left untouched and ZF and PF are set.  OF, SF, AF and CF are
 * always cleared.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_iBit      1-based index of the bit found, 0 if none.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR result the AMD way.
 *
 * @a a_iBit is treated as a 1-based bit position with 0 meaning that no bit
 * was found.  When a bit was found, its 0-based index is written to the
 * destination and ZF is cleared; otherwise the destination is left untouched
 * and ZF is set.  No other flag is modified.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_iBit      1-based index of the bit found, 0 if none.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1426
1427
1428/*
1429 * BSF - first (least significant) bit set
1430 */
1431IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1432{
1433 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1434}
1435
1436IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1437{
1438 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1439}
1440
1441IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1442{
1443 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1444}
1445
1446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1447
1448IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1449{
1450 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1451}
1452
1453IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1454{
1455 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1456}
1457
1458IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1459{
1460 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1461}
1462
1463
1464IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1465{
1466 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1467}
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1472}
1473
1474IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1475{
1476 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1477}
1478
1479# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1480
1481
1482/*
1483 * BSR - last (most significant) bit set
1484 */
1485IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1486{
1487 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1488}
1489
1490IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1491{
1492 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1493}
1494
1495IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1496{
1497 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1498}
1499
1500# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1501
1502IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1503{
1504 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1505}
1506
1507IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1510}
1511
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516
1517
1518IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1519{
1520 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1521}
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1526}
1527
1528IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1529{
1530 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1531}
1532
1533# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1534
1535
1536/*
1537 * Helpers for LZCNT and TZCNT.
1538 */
/**
 * Stores an LZCNT/TZCNT result the Intel way.
 *
 * The count is always written to the destination.  OF, SF, ZF, AF, PF and CF
 * are cleared first; then PF comes from g_afParity for a non-zero count, ZF
 * and PF are set for a zero count, and CF is set when the source is zero.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_uSrc      The source operand value (only tested for zero).
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_uResult   The bit count to store.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT result the AMD way.
 *
 * The count is always written to the destination.  Only ZF and CF are
 * recalculated: ZF is set for a zero count and CF is set when the source is
 * zero.  All other flags keep their incoming values.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_uSrc      The source operand value (only tested for zero).
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_uResult   The bit count to store.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1561
1562
1563/*
1564 * LZCNT - count leading zero bits.
1565 */
1566IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1567{
1568 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1569}
1570
1571IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1572{
1573 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1574}
1575
1576IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1577{
1578 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1579}
1580
1581# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1582
1583IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1584{
1585 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1586}
1587
1588IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1589{
1590 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1591}
1592
1593IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1594{
1595 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1596}
1597
1598
1599IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1600{
1601 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1602}
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1605{
1606 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1607}
1608
1609IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1610{
1611 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1612}
1613
1614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1615
1616
/*
 * TZCNT - count trailing zero bits.
 */
1620IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1621{
1622 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1623}
1624
1625IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1626{
1627 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1628}
1629
1630IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1631{
1632 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1633}
1634
1635# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1636
1637IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1638{
1639 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1640}
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1643{
1644 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1650}
1651
1652
1653IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1654{
1655 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1656}
1657
1658IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1659{
1660 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1661}
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1666}
1667
1668# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1669#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1670
1671/*
1672 * BEXTR (BMI1 instruction)
1673 */
1674#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1675IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1676 a_Type uSrc2, uint32_t *pfEFlags)) \
1677{ \
1678 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1679 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1680 a_Type uResult; \
1681 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1682 if (iFirstBit < a_cBits) \
1683 { \
1684 uResult = uSrc1 >> iFirstBit; \
1685 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1686 if (cBits < a_cBits) \
1687 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1688 *puDst = uResult; \
1689 if (!uResult) \
1690 fEfl |= X86_EFL_ZF; \
1691 } \
1692 else \
1693 { \
1694 *puDst = uResult = 0; \
1695 fEfl |= X86_EFL_ZF; \
1696 } \
1697 /** @todo complete flag calculations. */ \
1698 *pfEFlags = fEfl; \
1699}
1700
1701EMIT_BEXTR(64, uint64_t, _fallback)
1702EMIT_BEXTR(32, uint32_t, _fallback)
1703#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1704EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1705#endif
1706#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1708#endif
1709
1710/*
1711 * BLSR (BMI1 instruction)
1712 */
1713#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1714IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1715{ \
1716 uint32_t fEfl1 = *pfEFlags; \
1717 uint32_t fEfl2 = fEfl1; \
1718 *puDst = uSrc; \
1719 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1720 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1721 \
1722 /* AMD: The carry flag is from the SUB operation. */ \
1723 /* 10890xe: PF always cleared? */ \
1724 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1725 fEfl2 |= fEfl1 & X86_EFL_CF; \
1726 *pfEFlags = fEfl2; \
1727}
1728
1729EMIT_BLSR(64, uint64_t, _fallback)
1730EMIT_BLSR(32, uint32_t, _fallback)
1731#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1732EMIT_BLSR(64, uint64_t, RT_NOTHING)
1733#endif
1734#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSR(32, uint32_t, RT_NOTHING)
1736#endif
1737
1738/*
1739 * BLSMSK (BMI1 instruction)
1740 */
1741#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1742IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1743{ \
1744 uint32_t fEfl1 = *pfEFlags; \
1745 uint32_t fEfl2 = fEfl1; \
1746 *puDst = uSrc; \
1747 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1748 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1749 \
1750 /* AMD: The carry flag is from the SUB operation. */ \
1751 /* 10890xe: PF always cleared? */ \
1752 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1753 fEfl2 |= fEfl1 & X86_EFL_CF; \
1754 *pfEFlags = fEfl2; \
1755}
1756
1757EMIT_BLSMSK(64, uint64_t, _fallback)
1758EMIT_BLSMSK(32, uint32_t, _fallback)
1759#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1760EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1761#endif
1762#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1764#endif
1765
1766/*
1767 * BLSI (BMI1 instruction)
1768 */
1769#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1770IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1771{ \
1772 uint32_t fEfl1 = *pfEFlags; \
1773 uint32_t fEfl2 = fEfl1; \
1774 *puDst = uSrc; \
1775 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1776 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1777 \
1778 /* AMD: The carry flag is from the SUB operation. */ \
1779 /* 10890xe: PF always cleared? */ \
1780 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1781 fEfl2 |= fEfl1 & X86_EFL_CF; \
1782 *pfEFlags = fEfl2; \
1783}
1784
1785EMIT_BLSI(64, uint64_t, _fallback)
1786EMIT_BLSI(32, uint32_t, _fallback)
1787#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1788EMIT_BLSI(64, uint64_t, RT_NOTHING)
1789#endif
1790#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1791EMIT_BLSI(32, uint32_t, RT_NOTHING)
1792#endif
1793
1794/*
1795 * BZHI (BMI2 instruction)
1796 */
1797#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1798IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1799 a_Type uSrc2, uint32_t *pfEFlags)) \
1800{ \
1801 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1802 a_Type uResult; \
1803 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1804 if (iFirstBit < a_cBits) \
1805 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1806 else \
1807 { \
1808 uResult = uSrc1; \
1809 fEfl |= X86_EFL_CF; \
1810 } \
1811 *puDst = uResult; \
1812 fEfl |= X86_EFL_CALC_ZF(uResult); \
1813 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1814 *pfEFlags = fEfl; \
1815}
1816
1817EMIT_BZHI(64, uint64_t, _fallback)
1818EMIT_BZHI(32, uint32_t, _fallback)
1819#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1820EMIT_BZHI(64, uint64_t, RT_NOTHING)
1821#endif
1822#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1823EMIT_BZHI(32, uint32_t, RT_NOTHING)
1824#endif
1825
1826/*
1827 * POPCNT
1828 */
/** Bit-count lookup table for 6-bit values: entry i holds the number of set
 *  bits in i, for i = 0..63.  The iemPopCountUxx helpers sum one lookup per
 *  6-bit slice of their input. */
RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
};
1836
1837/** @todo Use native popcount where possible and employ some more efficient
1838 * algorithm here (or in asm.h fallback)! */
1839
1840DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1841{
1842 return g_abBitCounts6[ u16 & 0x3f]
1843 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1844 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1845}
1846
1847DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1848{
1849 return g_abBitCounts6[ u32 & 0x3f]
1850 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1851 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1852 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1853 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1854 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1855}
1856
1857DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1858{
1859 return g_abBitCounts6[ u64 & 0x3f]
1860 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1861 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1862 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1863 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1864 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1865 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1866 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1867 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1868 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1869 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1870}
1871
1872#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1873IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1874{ \
1875 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1876 a_Type uResult; \
1877 if (uSrc) \
1878 uResult = iemPopCountU ## a_cBits(uSrc); \
1879 else \
1880 { \
1881 fEfl |= X86_EFL_ZF; \
1882 uResult = 0; \
1883 } \
1884 *puDst = uResult; \
1885 *pfEFlags = fEfl; \
1886}
1887
1888EMIT_POPCNT(64, uint64_t, _fallback)
1889EMIT_POPCNT(32, uint32_t, _fallback)
1890EMIT_POPCNT(16, uint16_t, _fallback)
1891#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1892EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1893#endif
1894#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1895EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1896EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1897#endif
1898
1899
1900#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1901
1902/*
1903 * XCHG
1904 */
1905
1906IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1907{
1908#if ARCH_BITS >= 64
1909 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1910#else
1911 uint64_t uOldMem = *puMem;
1912 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1913 ASMNopPause();
1914 *puReg = uOldMem;
1915#endif
1916}
1917
1918# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1919
1920IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1921{
1922 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1923}
1924
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1927{
1928 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1929}
1930
1931
1932IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1933{
1934 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1935}
1936
1937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1938
1939
1940/* Unlocked variants for fDisregardLock mode: */
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1943{
1944 uint64_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1950
1951IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1952{
1953 uint32_t const uOld = *puMem;
1954 *puMem = *puReg;
1955 *puReg = uOld;
1956}
1957
1958
1959IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1960{
1961 uint16_t const uOld = *puMem;
1962 *puMem = *puReg;
1963 *puReg = uOld;
1964}
1965
1966
1967IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1968{
1969 uint8_t const uOld = *puMem;
1970 *puMem = *puReg;
1971 *puReg = uOld;
1972}
1973
1974# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1975
1976
1977/*
1978 * XADD and LOCK XADD.
1979 */
1980#define EMIT_XADD(a_cBitsWidth, a_Type) \
1981IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1982{ \
1983 a_Type uDst = *puDst; \
1984 a_Type uResult = uDst; \
1985 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1986 *puDst = uResult; \
1987 *puReg = uDst; \
1988} \
1989\
1990IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1991{ \
1992 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1993 a_Type uResult; \
1994 uint32_t fEflTmp; \
1995 do \
1996 { \
1997 uResult = uOld; \
1998 fEflTmp = *pfEFlags; \
1999 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2000 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2001 *puReg = uOld; \
2002 *pfEFlags = fEflTmp; \
2003}
2004EMIT_XADD(64, uint64_t)
2005# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2006EMIT_XADD(32, uint32_t)
2007EMIT_XADD(16, uint16_t)
2008EMIT_XADD(8, uint8_t)
2009# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2010
2011#endif
2012
2013/*
2014 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2015 *
2016 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2017 * instructions are emulated as locked.
2018 */
2019#if defined(IEM_WITHOUT_ASSEMBLY)
2020
2021IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2022{
2023 uint8_t uOld = *puAl;
2024 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2025 Assert(*puAl == uOld);
2026 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2027}
2028
2029
2030IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2031{
2032 uint16_t uOld = *puAx;
2033 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2034 Assert(*puAx == uOld);
2035 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2040{
2041 uint32_t uOld = *puEax;
2042 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2043 Assert(*puEax == uOld);
2044 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2045}
2046
2047
2048# if ARCH_BITS == 32
2049IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2050# else
2051IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2052# endif
2053{
2054# if ARCH_BITS == 32
2055 uint64_t const uSrcReg = *puSrcReg;
2056# endif
2057 uint64_t uOld = *puRax;
2058 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2059 Assert(*puRax == uOld);
2060 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2061}
2062
2063
2064IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2065 uint32_t *pEFlags))
2066{
2067 uint64_t const uNew = pu64EbxEcx->u;
2068 uint64_t const uOld = pu64EaxEdx->u;
2069 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2070 {
2071 Assert(pu64EaxEdx->u == uOld);
2072 *pEFlags |= X86_EFL_ZF;
2073 }
2074 else
2075 *pEFlags &= ~X86_EFL_ZF;
2076}
2077
2078
/*
 * CMPXCHG16B worker, only built when RT_ARCH_AMD64 or RT_ARCH_ARM64 is defined.
 *
 * The 128-bit compare-exchange is done by ASMAtomicCmpXchgU128v2 (AMD64) or
 * ASMAtomicCmpXchgU128 (otherwise), expecting *pu128RaxRdx and storing
 * *pu128RbxRcx; &pu128RaxRdx->u is passed as the last argument of either call.
 * ZF is set on success and cleared on failure; no other flag is touched.
 */
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
    /* Snapshot of the expected value, only needed by the strict-build assertion below. */
#  ifdef VBOX_STRICT
    RTUINT128U const uOld = *pu128RaxRdx;
#  endif
    /* Note: the two preprocessor branches supply alternative conditions for the same 'if'. */
#  if defined(RT_ARCH_AMD64)
    if (ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
                               &pu128RaxRdx->u))
#  else
    if (ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u))
#  endif
    {
        Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
    else
        *pEFlags &= ~X86_EFL_ZF;
}
# endif
2100
2101#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2102
2103# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2104IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2105 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2106{
2107 RTUINT128U u128Tmp = *pu128Dst;
2108 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2109 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2110 {
2111 *pu128Dst = *pu128RbxRcx;
2112 *pEFlags |= X86_EFL_ZF;
2113 }
2114 else
2115 {
2116 *pu128RaxRdx = u128Tmp;
2117 *pEFlags &= ~X86_EFL_ZF;
2118 }
2119}
2120#endif /* !RT_ARCH_ARM64 */
2121
2122#if defined(IEM_WITHOUT_ASSEMBLY)
2123
2124/* Unlocked versions mapped to the locked ones: */
2125
/** Unlocked 8-bit CMPXCHG: forwards all arguments unchanged to iemAImpl_cmpxchg_u8_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8,  (uint8_t  *pu8Dst,  uint8_t  *puAl,  uint8_t  uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
}
2130
2131
/** Unlocked 16-bit CMPXCHG: forwards all arguments unchanged to iemAImpl_cmpxchg_u16_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx,  uint16_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
}
2136
2137
/** Unlocked 32-bit CMPXCHG: forwards all arguments unchanged to iemAImpl_cmpxchg_u32_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
}
2142
2143
/* Unlocked 64-bit CMPXCHG: forwards all arguments unchanged to
 * iemAImpl_cmpxchg_u64_locked.  With ARCH_BITS == 32 the source operand is
 * passed by pointer, otherwise by value. */
# if ARCH_BITS == 32
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
}
# else
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
}
# endif
2155
2156
/** Unlocked CMPXCHG8B: forwards all arguments unchanged to iemAImpl_cmpxchg8b_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
}
2161
2162
/** Unlocked CMPXCHG16B: forwards all arguments unchanged to iemAImpl_cmpxchg16b_locked.
 * NOTE(review): the locked worker in this file is only defined when RT_ARCH_AMD64 or
 * RT_ARCH_ARM64 is defined, while this wrapper has no such guard - confirm other
 * IEM_WITHOUT_ASSEMBLY targets get the worker from elsewhere. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                             uint32_t *pEFlags))
{
    iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
}
2168
2169#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2170
2171#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2172 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2173
/*
 * MUL, IMUL, DIV and IDIV helpers.
 *
 * - The U64 versions must use 128-bit intermediates, so the multiplication and
 *   division steps are abstracted, letting us select between plain C operators
 *   and RTUInt128DivRem/RTUInt128MulU64ByU64.
 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, and
 *   IDIV/DIV take all of their input from AX too.  This means the input loads
 *   and the result stores have to be abstracted as well.
 */
2185
2186DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2187{
2188# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2189 pQuotient->s.Lo = 0;
2190 pQuotient->s.Hi = 0;
2191# endif
2192 RTUINT128U Divisor;
2193 Divisor.s.Lo = u64Divisor;
2194 Divisor.s.Hi = 0;
2195 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2196}
2197
/*
 * Operand load/store and arithmetic-step abstractions used by the MUL, IMUL,
 * DIV and IDIV emitters.
 *
 * The load/store macros reference puA/puD (or puAX for the 8-bit variants) from
 * the expanding function.  Every macro expands to a plain expression without a
 * trailing semicolon, so it can be used wherever a single statement is expected
 * (including as an unbraced if/else body).
 */

/* Load the double-width dividend from xDX:xAX (AX for the 8-bit variant). */
# define DIV_LOAD(a_Dividend) \
    (a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD
# define DIV_LOAD_U8(a_Dividend) \
    (a_Dividend).u = *puAX

/* Store quotient and remainder: xAX + xDX, or AL + AH for the 8-bit variant. */
# define DIV_STORE(a_Quotient, a_uRemainder)     *puA  = (a_Quotient), *puD = (a_uRemainder)
# define DIV_STORE_U8(a_Quotient, a_uRemainder)  *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8)

/* Load the implicit first factor: xAX, or AL for the 8-bit variant. */
# define MUL_LOAD_F1()                           *puA
# define MUL_LOAD_F1_U8()                        ((uint8_t)*puAX)

/* Store the double-width product: xDX:xAX, or AX for the 8-bit variant. */
# define MUL_STORE(a_Result)                     *puA  = (a_Result).s.Lo, *puD = (a_Result).s.Hi
# define MUL_STORE_U8(a_Result)                  *puAX = (a_Result).u

/* Two's complement negation of a double-width value. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Unsigned widening multiplication. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2)

/* Unsigned division yielding both quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    (a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
    (a_Remainder).u = (a_Dividend).u % (a_uDivisor)
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), a_uDivisor)
2227
2228
2229/*
2230 * MUL
2231 */
2232# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2233IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2234{ \
2235 RTUINT ## a_cBitsWidth2x ## U Result; \
2236 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2237 a_fnStore(Result); \
2238 \
2239 /* Calc EFLAGS: */ \
2240 uint32_t fEfl = *pfEFlags; \
2241 if (a_fIntelFlags) \
2242 { /* Intel: 6700K and 10980XE behavior */ \
2243 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2244 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2245 fEfl |= X86_EFL_SF; \
2246 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2247 if (Result.s.Hi != 0) \
2248 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2249 } \
2250 else \
2251 { /* AMD: 3990X */ \
2252 if (Result.s.Hi != 0) \
2253 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2254 else \
2255 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2256 } \
2257 *pfEFlags = fEfl; \
2258 return 0; \
2259} \
2260
2261# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2262 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2263 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2264 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2265
2266# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2267EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2268 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2269# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2270EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2271 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2272EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2273 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2274EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2275 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2276# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2277# endif /* !DOXYGEN_RUNNING */
2278
2279/*
2280 * MULX
2281 */
2282# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2283IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2284 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2285{ \
2286 RTUINT ## a_cBitsWidth2x ## U Result; \
2287 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2288 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2289 *puDst1 = Result.s.Hi; \
2290} \
2291
2292# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2293EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2294EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2295# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2296EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2297EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2298# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2299# endif /* !DOXYGEN_RUNNING */
2300
2301
2302/*
2303 * IMUL
2304 *
2305 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2306 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2307 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2308 */
2309# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2310 a_Suffix, a_fIntelFlags) \
2311IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2312{ \
2313 RTUINT ## a_cBitsWidth2x ## U Result; \
2314 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2315 \
2316 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2317 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2318 { \
2319 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2320 { \
2321 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2322 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2323 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2324 } \
2325 else \
2326 { \
2327 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2328 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2329 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2330 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2331 a_fnNeg(Result, a_cBitsWidth2x); \
2332 } \
2333 } \
2334 else \
2335 { \
2336 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2337 { \
2338 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2339 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2340 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2341 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2342 a_fnNeg(Result, a_cBitsWidth2x); \
2343 } \
2344 else \
2345 { \
2346 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2347 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2348 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2349 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2350 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2351 } \
2352 } \
2353 a_fnStore(Result); \
2354 \
2355 if (a_fIntelFlags) \
2356 { \
2357 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2358 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2359 fEfl |= X86_EFL_SF; \
2360 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2361 } \
2362 *pfEFlags = fEfl; \
2363 return 0; \
2364}
2365# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2366 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2367 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2368 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2369
2370# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2371EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2372 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2373# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2374EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2375 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2376EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2377 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2378EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2379 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2380# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2381# endif /* !DOXYGEN_RUNNING */
2382
2383
/*
 * IMUL with two operands are mapped onto the three operand variant, ignoring
 * the high part of the product.
 *
 * Each emitted function passes puDst as the first (in/out) operand of the
 * matching iemAImpl_imul_uNN[_intel|_amd] worker and a throw-away local for the
 * high half.  The worker's int return value is not examined.
 */
# define EMIT_IMUL_TWO(a_cBits, a_uType) \
IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
{ \
    a_uType uIgn; /* receives the discarded high half */ \
    iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
} \
\
IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
{ \
    a_uType uIgn; /* receives the discarded high half */ \
    iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
} \
\
IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
{ \
    a_uType uIgn; /* receives the discarded high half */ \
    iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
}

/* The 64-bit variants are always emitted here; 32- and 16-bit only under the condition below. */
EMIT_IMUL_TWO(64, uint64_t)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_IMUL_TWO(32, uint32_t)
EMIT_IMUL_TWO(16, uint16_t)
# endif
2412
2413
2414/*
2415 * DIV
2416 */
2417# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2418 a_Suffix, a_fIntelFlags) \
2419IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2420{ \
2421 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2422 a_fnLoad(Dividend); \
2423 if ( uDivisor != 0 \
2424 && Dividend.s.Hi < uDivisor) \
2425 { \
2426 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2427 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2428 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2429 \
2430 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2431 if (!a_fIntelFlags) \
2432 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2433 return 0; \
2434 } \
2435 /* #DE */ \
2436 return -1; \
2437}
2438# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2439 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2440 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2441 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2442
2443# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2444EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2445 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2447EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2448 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2449EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2450 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2451EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2452 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2453# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2454# endif /* !DOXYGEN_RUNNING */
2455
2456
2457/*
2458 * IDIV
2459 *
2460 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2461 * set AF and clear PF, ZF and SF just like it does for DIV.
2462 *
2463 */
2464# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2465 a_Suffix, a_fIntelFlags) \
2466IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2467{ \
2468 /* Note! Skylake leaves all flags alone. */ \
2469 \
2470 /** @todo overflow checks */ \
2471 if (uDivisor != 0) \
2472 { \
2473 /* \
2474 * Convert to unsigned division. \
2475 */ \
2476 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2477 a_fnLoad(Dividend); \
2478 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2479 if (fSignedDividend) \
2480 a_fnNeg(Dividend, a_cBitsWidth2x); \
2481 \
2482 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2483 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2484 uDivisorPositive = uDivisor; \
2485 else \
2486 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2487 \
2488 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2489 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2490 \
2491 /* \
2492 * Setup the result, checking for overflows. \
2493 */ \
2494 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2495 { \
2496 if (!fSignedDividend) \
2497 { \
2498 /* Positive divisor, positive dividend => result positive. */ \
2499 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2500 { \
2501 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2502 if (!a_fIntelFlags) \
2503 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2504 return 0; \
2505 } \
2506 } \
2507 else \
2508 { \
2509 /* Positive divisor, negative dividend => result negative. */ \
2510 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2511 { \
2512 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2513 if (!a_fIntelFlags) \
2514 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2515 return 0; \
2516 } \
2517 } \
2518 } \
2519 else \
2520 { \
2521 if (!fSignedDividend) \
2522 { \
2523 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2524 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2525 { \
2526 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2527 if (!a_fIntelFlags) \
2528 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2529 return 0; \
2530 } \
2531 } \
2532 else \
2533 { \
2534 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2535 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2536 { \
2537 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2538 if (!a_fIntelFlags) \
2539 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2540 return 0; \
2541 } \
2542 } \
2543 } \
2544 } \
2545 /* #DE */ \
2546 return -1; \
2547}
2548# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2549 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2550 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2551 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2552
2553# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2554EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2555 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2557EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2558 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2559EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2560 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2561EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2562 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2563# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2564# endif /* !DOXYGEN_RUNNING */
2565
2566#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2567
2568
2569/*********************************************************************************************************************************
2570* Unary operations. *
2571*********************************************************************************************************************************/
2572#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2573
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * This is a statement macro; the new flags are written back through
 * @a a_pfEFlags and nothing is returned.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_OfMethod      0 for INC-style OF calculation, anything else for
 *                          DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        /* Drop all status bits but CF. */ \
        uint32_t fEflTmp = *(a_pfEFlags) & (~X86_EFL_STATUS_BITS | X86_EFL_CF); \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        /* INC overflows when the sign bit goes 0 -> 1, DEC when it goes 1 -> 0. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
                                                   : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2599
2600/*
2601 * INC
2602 */
2603
2604IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2605{
2606 uint64_t uDst = *puDst;
2607 uint64_t uResult = uDst + 1;
2608 *puDst = uResult;
2609 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2610}
2611
2612# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2613
2614IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2615{
2616 uint32_t uDst = *puDst;
2617 uint32_t uResult = uDst + 1;
2618 *puDst = uResult;
2619 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2620}
2621
2622
2623IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2624{
2625 uint16_t uDst = *puDst;
2626 uint16_t uResult = uDst + 1;
2627 *puDst = uResult;
2628 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2629}
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint8_t uDst = *puDst;
2634 uint8_t uResult = uDst + 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2637}
2638
2639# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2640
2641
2642/*
2643 * DEC
2644 */
2645
2646IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2647{
2648 uint64_t uDst = *puDst;
2649 uint64_t uResult = uDst - 1;
2650 *puDst = uResult;
2651 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2652}
2653
2654# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2655
2656IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2657{
2658 uint32_t uDst = *puDst;
2659 uint32_t uResult = uDst - 1;
2660 *puDst = uResult;
2661 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2662}
2663
2664
2665IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2666{
2667 uint16_t uDst = *puDst;
2668 uint16_t uResult = uDst - 1;
2669 *puDst = uResult;
2670 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2671}
2672
2673
2674IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2675{
2676 uint8_t uDst = *puDst;
2677 uint8_t uResult = uDst - 1;
2678 *puDst = uResult;
2679 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2680}
2681
2682# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2683
2684
2685/*
2686 * NOT
2687 */
2688
2689IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2690{
2691 uint64_t uDst = *puDst;
2692 uint64_t uResult = ~uDst;
2693 *puDst = uResult;
2694 /* EFLAGS are not modified. */
2695 RT_NOREF_PV(pfEFlags);
2696}
2697
2698# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2699
2700IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2701{
2702 uint32_t uDst = *puDst;
2703 uint32_t uResult = ~uDst;
2704 *puDst = uResult;
2705 /* EFLAGS are not modified. */
2706 RT_NOREF_PV(pfEFlags);
2707}
2708
2709IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2710{
2711 uint16_t uDst = *puDst;
2712 uint16_t uResult = ~uDst;
2713 *puDst = uResult;
2714 /* EFLAGS are not modified. */
2715 RT_NOREF_PV(pfEFlags);
2716}
2717
2718IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2719{
2720 uint8_t uDst = *puDst;
2721 uint8_t uResult = ~uDst;
2722 *puDst = uResult;
2723 /* EFLAGS are not modified. */
2724 RT_NOREF_PV(pfEFlags);
2725}
2726
2727# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2728
2729
2730/*
2731 * NEG
2732 */
2733
/**
 * Recalculates all six status bits (CF, PF, AF, ZF, SF and OF) for a NEG
 * instruction and stores the merged value back through @a a_pfEFlags.
 *
 * This is a statement macro, it does not yield a value.  The arguments are
 * expanded more than once, so they must be free of side effects.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for CF, AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags) & ~(X86_EFL_STATUS_BITS | X86_EFL_CF); \
        /* CF: set for any non-zero input. */ \
        fEflTmp |= (uint32_t)((a_uDst) != 0) << X86_EFL_CF_BIT; \
        /* OF: sign bit set in both input and output. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        /* AF: bit 4 differs between input and output. */ \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2755
2756IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2757{
2758 uint64_t uDst = *puDst;
2759 uint64_t uResult = (uint64_t)0 - uDst;
2760 *puDst = uResult;
2761 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2762}
2763
2764# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2765
2766IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2767{
2768 uint32_t uDst = *puDst;
2769 uint32_t uResult = (uint32_t)0 - uDst;
2770 *puDst = uResult;
2771 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2772}
2773
2774
2775IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2776{
2777 uint16_t uDst = *puDst;
2778 uint16_t uResult = (uint16_t)0 - uDst;
2779 *puDst = uResult;
2780 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2781}
2782
2783
2784IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2785{
2786 uint8_t uDst = *puDst;
2787 uint8_t uResult = (uint8_t)0 - uDst;
2788 *puDst = uResult;
2789 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2790}
2791
2792# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2793
2794/*
2795 * Locked variants.
2796 */
2797
2798/** Emit a function for doing a locked unary operand operation. */
2799# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2800 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2801 uint32_t *pfEFlags)) \
2802 { \
2803 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2804 uint ## a_cBitsWidth ## _t uTmp; \
2805 uint32_t fEflTmp; \
2806 do \
2807 { \
2808 uTmp = uOld; \
2809 fEflTmp = *pfEFlags; \
2810 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2811 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2812 *pfEFlags = fEflTmp; \
2813 }
2814
2815EMIT_LOCKED_UNARY_OP(inc, 64)
2816EMIT_LOCKED_UNARY_OP(dec, 64)
2817EMIT_LOCKED_UNARY_OP(not, 64)
2818EMIT_LOCKED_UNARY_OP(neg, 64)
2819# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2820EMIT_LOCKED_UNARY_OP(inc, 32)
2821EMIT_LOCKED_UNARY_OP(dec, 32)
2822EMIT_LOCKED_UNARY_OP(not, 32)
2823EMIT_LOCKED_UNARY_OP(neg, 32)
2824
2825EMIT_LOCKED_UNARY_OP(inc, 16)
2826EMIT_LOCKED_UNARY_OP(dec, 16)
2827EMIT_LOCKED_UNARY_OP(not, 16)
2828EMIT_LOCKED_UNARY_OP(neg, 16)
2829
2830EMIT_LOCKED_UNARY_OP(inc, 8)
2831EMIT_LOCKED_UNARY_OP(dec, 8)
2832EMIT_LOCKED_UNARY_OP(not, 8)
2833EMIT_LOCKED_UNARY_OP(neg, 8)
2834# endif
2835
2836#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2837
2838
2839/*********************************************************************************************************************************
2840* Shifting and Rotating *
2841*********************************************************************************************************************************/
2842
2843/*
2844 * ROL
2845 */
2846#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2847IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2848{ \
2849 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2850 if (cShift) \
2851 { \
2852 if (a_cBitsWidth < 32) \
2853 cShift &= a_cBitsWidth - 1; \
2854 a_uType const uDst = *puDst; \
2855 a_uType const uResult = a_fnHlp(uDst, cShift); \
2856 *puDst = uResult; \
2857 \
2858 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
2859 it the same way as for 1 bit shifts. */ \
2860 AssertCompile(X86_EFL_CF_BIT == 0); \
2861 uint32_t fEfl = *pfEFlags; \
2862 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2863 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2864 fEfl |= fCarry; \
2865 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2866 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2867 else /* Intel 10980XE: According to the first sub-shift: */ \
2868 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2869 *pfEFlags = fEfl; \
2870 } \
2871}
2872
2873#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2874EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2875#endif
2876EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2877EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2878
2879#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2880EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2881#endif
2882EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2883EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2884
2885DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2886{
2887 return (uValue << cShift) | (uValue >> (16 - cShift));
2888}
2889#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2890EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2891#endif
2892EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2893EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2894
2895DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2896{
2897 return (uValue << cShift) | (uValue >> (8 - cShift));
2898}
2899#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2900EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2901#endif
2902EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2903EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2904
2905
2906/*
2907 * ROR
2908 */
2909#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2910IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2911{ \
2912 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2913 if (cShift) \
2914 { \
2915 if (a_cBitsWidth < 32) \
2916 cShift &= a_cBitsWidth - 1; \
2917 a_uType const uDst = *puDst; \
2918 a_uType const uResult = a_fnHlp(uDst, cShift); \
2919 *puDst = uResult; \
2920 \
2921 /* Calc EFLAGS: */ \
2922 AssertCompile(X86_EFL_CF_BIT == 0); \
2923 uint32_t fEfl = *pfEFlags; \
2924 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2925 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2926 fEfl |= fCarry; \
2927 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2928 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2929 else /* Intel 10980XE: According to the first sub-shift: */ \
2930 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2931 *pfEFlags = fEfl; \
2932 } \
2933}
2934
2935#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2936EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2937#endif
2938EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2939EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2940
2941#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2942EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2943#endif
2944EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2945EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2946
2947DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2948{
2949 return (uValue >> cShift) | (uValue << (16 - cShift));
2950}
2951#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2952EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2953#endif
2954EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2955EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2956
2957DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2958{
2959 return (uValue >> cShift) | (uValue << (8 - cShift));
2960}
2961#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2962EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2963#endif
2964EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2965EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2966
2967
2968/*
2969 * RCL
2970 */
2971#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2972IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2973{ \
2974 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2975 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2976 cShift %= a_cBitsWidth + 1; \
2977 if (cShift) \
2978 { \
2979 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2980 cShift %= a_cBitsWidth + 1; \
2981 a_uType const uDst = *puDst; \
2982 a_uType uResult = uDst << cShift; \
2983 if (cShift > 1) \
2984 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2985 \
2986 AssertCompile(X86_EFL_CF_BIT == 0); \
2987 uint32_t fEfl = *pfEFlags; \
2988 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2989 uResult |= (a_uType)fInCarry << (cShift - 1); \
2990 \
2991 *puDst = uResult; \
2992 \
2993 /* Calc EFLAGS. */ \
2994 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2995 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2996 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2997 fEfl |= fOutCarry; \
2998 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2999 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3000 else /* Intel 10980XE: According to the first sub-shift: */ \
3001 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3002 *pfEFlags = fEfl; \
3003 } \
3004}
3005
3006#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3007EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3008#endif
3009EMIT_RCL(64, uint64_t, _intel, 1)
3010EMIT_RCL(64, uint64_t, _amd, 0)
3011
3012#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3013EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3014#endif
3015EMIT_RCL(32, uint32_t, _intel, 1)
3016EMIT_RCL(32, uint32_t, _amd, 0)
3017
3018#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3019EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3020#endif
3021EMIT_RCL(16, uint16_t, _intel, 1)
3022EMIT_RCL(16, uint16_t, _amd, 0)
3023
3024#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3025EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3026#endif
3027EMIT_RCL(8, uint8_t, _intel, 1)
3028EMIT_RCL(8, uint8_t, _amd, 0)
3029
3030
3031/*
3032 * RCR
3033 */
3034#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3035IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3036{ \
3037 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3038 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3039 cShift %= a_cBitsWidth + 1; \
3040 if (cShift) \
3041 { \
3042 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3043 cShift %= a_cBitsWidth + 1; \
3044 a_uType const uDst = *puDst; \
3045 a_uType uResult = uDst >> cShift; \
3046 if (cShift > 1) \
3047 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3048 \
3049 AssertCompile(X86_EFL_CF_BIT == 0); \
3050 uint32_t fEfl = *pfEFlags; \
3051 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3052 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3053 *puDst = uResult; \
3054 \
3055 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3056 it the same way as for 1 bit shifts. */ \
3057 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3058 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3059 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3060 fEfl |= fOutCarry; \
3061 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3062 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3063 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3064 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3065 *pfEFlags = fEfl; \
3066 } \
3067}
3068
3069#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3070EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3071#endif
3072EMIT_RCR(64, uint64_t, _intel, 1)
3073EMIT_RCR(64, uint64_t, _amd, 0)
3074
3075#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3076EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3077#endif
3078EMIT_RCR(32, uint32_t, _intel, 1)
3079EMIT_RCR(32, uint32_t, _amd, 0)
3080
3081#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3082EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3083#endif
3084EMIT_RCR(16, uint16_t, _intel, 1)
3085EMIT_RCR(16, uint16_t, _amd, 0)
3086
3087#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3088EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3089#endif
3090EMIT_RCR(8, uint8_t, _intel, 1)
3091EMIT_RCR(8, uint8_t, _amd, 0)
3092
3093
3094/*
3095 * SHL
3096 */
3097#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3098IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3099{ \
3100 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3101 if (cShift) \
3102 { \
3103 a_uType const uDst = *puDst; \
3104 a_uType uResult = uDst << cShift; \
3105 *puDst = uResult; \
3106 \
3107 /* Calc EFLAGS. */ \
3108 AssertCompile(X86_EFL_CF_BIT == 0); \
3109 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3110 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3111 fEfl |= fCarry; \
3112 if (!a_fIntelFlags) \
3113 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3114 else \
3115 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3116 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3117 fEfl |= X86_EFL_CALC_ZF(uResult); \
3118 fEfl |= g_afParity[uResult & 0xff]; \
3119 if (!a_fIntelFlags) \
3120 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3121 *pfEFlags = fEfl; \
3122 } \
3123}
3124
3125#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3126EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3127#endif
3128EMIT_SHL(64, uint64_t, _intel, 1)
3129EMIT_SHL(64, uint64_t, _amd, 0)
3130
3131#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3132EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3133#endif
3134EMIT_SHL(32, uint32_t, _intel, 1)
3135EMIT_SHL(32, uint32_t, _amd, 0)
3136
3137#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3138EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3139#endif
3140EMIT_SHL(16, uint16_t, _intel, 1)
3141EMIT_SHL(16, uint16_t, _amd, 0)
3142
3143#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3144EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3145#endif
3146EMIT_SHL(8, uint8_t, _intel, 1)
3147EMIT_SHL(8, uint8_t, _amd, 0)
3148
3149
3150/*
3151 * SHR
3152 */
3153#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3154IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3155{ \
3156 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3157 if (cShift) \
3158 { \
3159 a_uType const uDst = *puDst; \
3160 a_uType uResult = uDst >> cShift; \
3161 *puDst = uResult; \
3162 \
3163 /* Calc EFLAGS. */ \
3164 AssertCompile(X86_EFL_CF_BIT == 0); \
3165 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3166 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3167 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3168 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3169 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3170 fEfl |= X86_EFL_CALC_ZF(uResult); \
3171 fEfl |= g_afParity[uResult & 0xff]; \
3172 if (!a_fIntelFlags) \
3173 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3174 *pfEFlags = fEfl; \
3175 } \
3176}
3177
3178#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3179EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3180#endif
3181EMIT_SHR(64, uint64_t, _intel, 1)
3182EMIT_SHR(64, uint64_t, _amd, 0)
3183
3184#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3185EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3186#endif
3187EMIT_SHR(32, uint32_t, _intel, 1)
3188EMIT_SHR(32, uint32_t, _amd, 0)
3189
3190#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3191EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3192#endif
3193EMIT_SHR(16, uint16_t, _intel, 1)
3194EMIT_SHR(16, uint16_t, _amd, 0)
3195
3196#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3197EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3198#endif
3199EMIT_SHR(8, uint8_t, _intel, 1)
3200EMIT_SHR(8, uint8_t, _amd, 0)
3201
3202
3203/*
3204 * SAR
3205 */
3206#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3207IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3208{ \
3209 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3210 if (cShift) \
3211 { \
3212 a_iType const iDst = (a_iType)*puDst; \
3213 a_uType uResult = iDst >> cShift; \
3214 *puDst = uResult; \
3215 \
3216 /* Calc EFLAGS. \
3217 Note! The OF flag is always zero because the result never differs from the input. */ \
3218 AssertCompile(X86_EFL_CF_BIT == 0); \
3219 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3220 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3221 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3222 fEfl |= X86_EFL_CALC_ZF(uResult); \
3223 fEfl |= g_afParity[uResult & 0xff]; \
3224 if (!a_fIntelFlags) \
3225 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3226 *pfEFlags = fEfl; \
3227 } \
3228}
3229
3230#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3231EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3232#endif
3233EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3234EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3235
3236#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3237EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3238#endif
3239EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3240EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3241
3242#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3243EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3244#endif
3245EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3246EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3247
3248#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3249EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3250#endif
3251EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3252EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3253
3254
3255/*
3256 * SHLD
3257 *
3258 * - CF is the last bit shifted out of puDst.
3259 * - AF is always cleared by Intel 10980XE.
3260 * - AF is always set by AMD 3990X.
3261 * - OF is set according to the first shift on Intel 10980XE, it seems.
3262 * - OF is set according to the last sub-shift on AMD 3990X.
3263 * - ZF, SF and PF are calculated according to the result by both vendors.
3264 *
3265 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3266 * pick either the source register or the destination register for input bits
3267 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3268 * intel has changed behaviour here several times. We implement what current
3269 * skylake based does for now, we can extend this later as needed.
3270 */
3271#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3272IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3273 uint32_t *pfEFlags)) \
3274{ \
3275 cShift &= a_cBitsWidth - 1; \
3276 if (cShift) \
3277 { \
3278 a_uType const uDst = *puDst; \
3279 a_uType uResult = uDst << cShift; \
3280 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3281 *puDst = uResult; \
3282 \
3283 /* CALC EFLAGS: */ \
3284 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3285 if (a_fIntelFlags) \
3286 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3287 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3288 else \
3289 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3290 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3291 fEfl |= X86_EFL_AF; \
3292 } \
3293 AssertCompile(X86_EFL_CF_BIT == 0); \
3294 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3295 fEfl |= g_afParity[uResult & 0xff]; \
3296 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3297 fEfl |= X86_EFL_CALC_ZF(uResult); \
3298 *pfEFlags = fEfl; \
3299 } \
3300}
3301
3302#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3303EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3304#endif
3305EMIT_SHLD(64, uint64_t, _intel, 1)
3306EMIT_SHLD(64, uint64_t, _amd, 0)
3307
3308#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3309EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3310#endif
3311EMIT_SHLD(32, uint32_t, _intel, 1)
3312EMIT_SHLD(32, uint32_t, _amd, 0)
3313
3314#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3315IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3316{ \
3317 cShift &= 31; \
3318 if (cShift) \
3319 { \
3320 uint16_t const uDst = *puDst; \
3321 uint64_t const uTmp = a_fIntelFlags \
3322 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3323 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3324 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3325 *puDst = uResult; \
3326 \
3327 /* CALC EFLAGS: */ \
3328 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3329 AssertCompile(X86_EFL_CF_BIT == 0); \
3330 if (a_fIntelFlags) \
3331 { \
3332 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3333 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3334 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3335 } \
3336 else \
3337 { \
3338 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3339 if (cShift < 16) \
3340 { \
3341 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3342 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3343 } \
3344 else \
3345 { \
3346 if (cShift == 16) \
3347 fEfl |= uDst & X86_EFL_CF; \
3348 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3349 } \
3350 fEfl |= X86_EFL_AF; \
3351 } \
3352 fEfl |= g_afParity[uResult & 0xff]; \
3353 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3354 fEfl |= X86_EFL_CALC_ZF(uResult); \
3355 *pfEFlags = fEfl; \
3356 } \
3357}
3358
3359#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3360EMIT_SHLD_16(RT_NOTHING, 1)
3361#endif
3362EMIT_SHLD_16(_intel, 1)
3363EMIT_SHLD_16(_amd, 0)
3364
3365
3366/*
3367 * SHRD
3368 *
3369 * EFLAGS behaviour seems to be the same as with SHLD:
3370 * - CF is the last bit shifted out of puDst.
3371 * - AF is always cleared by Intel 10980XE.
3372 * - AF is always set by AMD 3990X.
3373 * - OF is set according to the first shift on Intel 10980XE, it seems.
3374 * - OF is set according to the last sub-shift on AMD 3990X.
3375 * - ZF, SF and PF are calculated according to the result by both vendors.
3376 *
3377 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3378 * pick either the source register or the destination register for input bits
3379 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3380 * intel has changed behaviour here several times. We implement what current
3381 * skylake based does for now, we can extend this later as needed.
3382 */
3383#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3384IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3385{ \
3386 cShift &= a_cBitsWidth - 1; \
3387 if (cShift) \
3388 { \
3389 a_uType const uDst = *puDst; \
3390 a_uType uResult = uDst >> cShift; \
3391 uResult |= uSrc << (a_cBitsWidth - cShift); \
3392 *puDst = uResult; \
3393 \
3394 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3395 AssertCompile(X86_EFL_CF_BIT == 0); \
3396 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3397 if (a_fIntelFlags) \
3398 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3399 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3400 else \
3401 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3402 if (cShift > 1) /* Set according to last shift. */ \
3403 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3404 else \
3405 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3406 fEfl |= X86_EFL_AF; \
3407 } \
3408 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3409 fEfl |= X86_EFL_CALC_ZF(uResult); \
3410 fEfl |= g_afParity[uResult & 0xff]; \
3411 *pfEFlags = fEfl; \
3412 } \
3413}
3414
3415#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3416EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3417#endif
3418EMIT_SHRD(64, uint64_t, _intel, 1)
3419EMIT_SHRD(64, uint64_t, _amd, 0)
3420
3421#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3422EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3423#endif
3424EMIT_SHRD(32, uint32_t, _intel, 1)
3425EMIT_SHRD(32, uint32_t, _amd, 0)
3426
3427#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3428IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3429{ \
3430 cShift &= 31; \
3431 if (cShift) \
3432 { \
3433 uint16_t const uDst = *puDst; \
3434 uint64_t const uTmp = a_fIntelFlags \
3435 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3436 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3437 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3438 *puDst = uResult; \
3439 \
3440 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3441 AssertCompile(X86_EFL_CF_BIT == 0); \
3442 if (a_fIntelFlags) \
3443 { \
3444 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3445 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3446 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3447 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3448 } \
3449 else \
3450 { \
3451 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3452 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3453 /* AMD 3990X: Set according to last shift. AF always set. */ \
3454 if (cShift > 1) /* Set according to last shift. */ \
3455 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3456 else \
3457 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3458 fEfl |= X86_EFL_AF; \
3459 } \
3460 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3461 fEfl |= X86_EFL_CALC_ZF(uResult); \
3462 fEfl |= g_afParity[uResult & 0xff]; \
3463 *pfEFlags = fEfl; \
3464 } \
3465}
3466
3467#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3468EMIT_SHRD_16(RT_NOTHING, 1)
3469#endif
3470EMIT_SHRD_16(_intel, 1)
3471EMIT_SHRD_16(_amd, 0)
3472
3473
3474/*
3475 * RORX (BMI2)
3476 */
3477#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3478IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3479{ \
3480 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3481}
3482
3483#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3484EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3485#endif
3486#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3487EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3488#endif
3489
3490
3491/*
3492 * SHLX (BMI2)
3493 */
3494#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3495IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3496{ \
3497 cShift &= a_cBitsWidth - 1; \
3498 *puDst = uSrc << cShift; \
3499}
3500
3501#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3502EMIT_SHLX(64, uint64_t, RT_NOTHING)
3503EMIT_SHLX(64, uint64_t, _fallback)
3504#endif
3505#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3506EMIT_SHLX(32, uint32_t, RT_NOTHING)
3507EMIT_SHLX(32, uint32_t, _fallback)
3508#endif
3509
3510
3511/*
3512 * SHRX (BMI2)
3513 */
3514#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3515IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3516{ \
3517 cShift &= a_cBitsWidth - 1; \
3518 *puDst = uSrc >> cShift; \
3519}
3520
3521#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3522EMIT_SHRX(64, uint64_t, RT_NOTHING)
3523EMIT_SHRX(64, uint64_t, _fallback)
3524#endif
3525#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3526EMIT_SHRX(32, uint32_t, RT_NOTHING)
3527EMIT_SHRX(32, uint32_t, _fallback)
3528#endif
3529
3530
3531/*
3532 * SARX (BMI2)
3533 */
3534#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3535IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3536{ \
3537 cShift &= a_cBitsWidth - 1; \
3538 *puDst = (a_iType)uSrc >> cShift; \
3539}
3540
3541#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3542EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3543EMIT_SARX(64, uint64_t, int64_t, _fallback)
3544#endif
3545#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3546EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3547EMIT_SARX(32, uint32_t, int32_t, _fallback)
3548#endif
3549
3550
3551/*
3552 * PDEP (BMI2)
3553 */
3554#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PDEP(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PDEP(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PDEP(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PDEP(32, uint32_t, _fallback)
3575
3576/*
3577 * PEXT (BMI2)
3578 */
3579#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3580IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3581{ \
3582 a_uType uResult = 0; \
3583 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3584 if (fMask & ((a_uType)1 << iMaskBit)) \
3585 { \
3586 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3587 iBit++; \
3588 } \
3589 *puDst = uResult; \
3590}
3591
3592#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3593EMIT_PEXT(64, uint64_t, RT_NOTHING)
3594#endif
3595EMIT_PEXT(64, uint64_t, _fallback)
3596#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3597EMIT_PEXT(32, uint32_t, RT_NOTHING)
3598#endif
3599EMIT_PEXT(32, uint32_t, _fallback)
3600
3601
3602#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3603
3604# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3605/*
3606 * BSWAP
3607 */
3608
3609IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3610{
3611 *puDst = ASMByteSwapU64(*puDst);
3612}
3613
3614
3615IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3616{
3617 *puDst = ASMByteSwapU32(*puDst);
3618}
3619
3620
3621/* Note! undocument, so 32-bit arg */
3622IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3623{
3624#if 0
3625 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3626#else
3627 /* This is the behaviour AMD 3990x (64-bit mode): */
3628 *(uint16_t *)puDst = 0;
3629#endif
3630}
3631
3632# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3633
3634
3635
3636# if defined(IEM_WITHOUT_ASSEMBLY)
3637
3638/*
3639 * LFENCE, SFENCE & MFENCE.
3640 */
3641
/** LFENCE worker: issues the IPRT read fence. */
IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
{
    ASMReadFence();
}


/** SFENCE worker: issues the IPRT write fence. */
IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
{
    ASMWriteFence();
}


/** MFENCE worker: issues the IPRT full memory fence. */
IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
{
    ASMMemoryFence();
}


# ifndef RT_ARCH_ARM64
/**
 * Alternative memory fence worker; same full memory fence as iemAImpl_mfence.
 * Not built for ARM64.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
{
    ASMMemoryFence();
}
# endif
3666
3667# endif
3668
3669#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3670
3671
3672IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3673{
3674 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3675 {
3676 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3677 *pu16Dst |= u16Src & X86_SEL_RPL;
3678
3679 *pfEFlags |= X86_EFL_ZF;
3680 }
3681 else
3682 *pfEFlags &= ~X86_EFL_ZF;
3683}
3684
3685
3686#if defined(IEM_WITHOUT_ASSEMBLY)
3687
3688/*********************************************************************************************************************************
3689* x87 FPU Loads *
3690*********************************************************************************************************************************/
3691
/**
 * Worker for loading a 32-bit floating point value as an 80-bit one.
 *
 * The returned FSW always has TOP set to 7 and carries C0, C2 and C3 over from
 * the incoming FSW (see iemAImpl_fld1).  Exception flags are added depending
 * on the class of the input:
 *      - Normal, zero and infinity are converted without raising anything.
 *      - Subnormal input is normalized and raises \#D (DE); ES+B are also set
 *        if the denormal exception is unmasked.
 *      - Signalling NaNs are quieted and raise \#I (IE); if the invalid
 *        exception is unmasked the result is zeroed, TOP cleared and ES+B set.
 *
 * @param   pFpuState   The FPU state; only FSW and FCW are read.
 * @param   pFpuRes     Where to return the 80-bit result and the FSW.
 * @param   pr32Val     The 32-bit value to load.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (RTFLOAT32U_IS_NORMAL(pr32Val))
    {
        /* Widen the fraction (left aligned) and rebias the exponent. */
        pFpuRes->r80Result.sj64.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                         << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
        pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
        Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
    }
    else if (RTFLOAT32U_IS_ZERO(pr32Val))
    {
        /* Signed zero stays a signed zero. */
        pFpuRes->r80Result.s.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = 0;
        pFpuRes->r80Result.s.uMantissa = 0;
        Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
    }
    else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
    {
        /* Subnormal values get normalized. */
        pFpuRes->r80Result.sj64.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        /* Number of leading zero bits in the fraction field. */
        unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
        /* The extra '+ 1' shifts the topmost set fraction bit out of the field (it becomes the integer bit). */
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                         << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
        pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
        pFpuRes->FSW |= X86_FSW_DE;
        if (!(pFpuState->FCW & X86_FCW_DM))
            pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
    }
    else if (RTFLOAT32U_IS_INF(pr32Val))
    {
        /* Infinity: max exponent, only the integer bit set. */
        pFpuRes->r80Result.s.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
        Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
    }
    else
    {
        /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
        Assert(RTFLOAT32U_IS_NAN(pr32Val));
        pFpuRes->r80Result.sj64.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                         << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
        if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
        {
            pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
            pFpuRes->FSW |= X86_FSW_IE;

            if (!(pFpuState->FCW & X86_FCW_IM))
            {
                /* The value is not pushed. */
                pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
                pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
                pFpuRes->r80Result.au64[0] = 0;
                pFpuRes->r80Result.au16[4] = 0;
            }
        }
        else
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
    }
}
3759
3760
/**
 * Worker for loading a 64-bit floating point value as an 80-bit one.
 *
 * The returned FSW always has TOP set to 7 and carries C0, C2 and C3 over from
 * the incoming FSW (see iemAImpl_fld1).  Exception flags are added depending
 * on the class of the input:
 *      - Normal, zero and infinity are converted without raising anything.
 *      - Subnormal input is normalized and raises \#D (DE); ES+B are also set
 *        if the denormal exception is unmasked.
 *      - Signalling NaNs are quieted and raise \#I (IE); if the invalid
 *        exception is unmasked the result is zeroed, TOP cleared and ES+B set.
 *
 * @param   pFpuState   The FPU state; only FSW and FCW are read.
 * @param   pFpuRes     Where to return the 80-bit result and the FSW.
 * @param   pr64Val     The 64-bit value to load.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (RTFLOAT64U_IS_NORMAL(pr64Val))
    {
        /* Widen the fraction (left aligned) and rebias the exponent. */
        pFpuRes->r80Result.sj64.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
        pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
        Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
    }
    else if (RTFLOAT64U_IS_ZERO(pr64Val))
    {
        /* Signed zero stays a signed zero. */
        pFpuRes->r80Result.s.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = 0;
        pFpuRes->r80Result.s.uMantissa = 0;
        Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
    }
    else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
    {
        /* Subnormal values get normalized. */
        pFpuRes->r80Result.sj64.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        /* Number of leading zero bits in the fraction field. */
        unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
        /* The extra '+ 1' shifts the topmost set fraction bit out of the field (it becomes the integer bit). */
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
                                         << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
        pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
        pFpuRes->FSW |= X86_FSW_DE;
        if (!(pFpuState->FCW & X86_FCW_DM))
            pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
    }
    else if (RTFLOAT64U_IS_INF(pr64Val))
    {
        /* Infinity: max exponent, only the integer bit set. */
        pFpuRes->r80Result.s.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
        Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
    }
    else
    {
        /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
        Assert(RTFLOAT64U_IS_NAN(pr64Val));
        pFpuRes->r80Result.sj64.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
        if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
        {
            pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
            pFpuRes->FSW |= X86_FSW_IE;

            if (!(pFpuState->FCW & X86_FCW_IM))
            {
                /* The value is not pushed. */
                pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
                pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
                pFpuRes->r80Result.au64[0] = 0;
                pFpuRes->r80Result.au16[4] = 0;
            }
        }
        else
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
    }
}
3826
3827
3828IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3829{
3830 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3831 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3832 /* Raises no exceptions. */
3833 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3834}
3835
3836
3837IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3838{
3839 pFpuRes->r80Result.sj64.fSign = 0;
3840 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3841 pFpuRes->r80Result.sj64.fInteger = 1;
3842 pFpuRes->r80Result.sj64.uFraction = 0;
3843
3844 /*
3845 * FPU status word:
3846 * - TOP is irrelevant, but we must match x86 assembly version.
3847 * - C1 is always cleared as we don't have any stack overflows.
3848 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3849 */
3850 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3851}
3852
3853
3854IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3855{
3856 pFpuRes->r80Result.sj64.fSign = 0;
3857 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3858 pFpuRes->r80Result.sj64.fInteger = 1;
3859 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3860 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3861 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3862 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3863}
3864
3865
3866IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3867{
3868 pFpuRes->r80Result.sj64.fSign = 0;
3869 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3870 pFpuRes->r80Result.sj64.fInteger = 1;
3871 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3872 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3873 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3874}
3875
3876
3877IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3878{
3879 pFpuRes->r80Result.sj64.fSign = 0;
3880 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3881 pFpuRes->r80Result.sj64.fInteger = 1;
3882 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3883 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3884 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3885 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3886}
3887
3888
3889IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3890{
3891 pFpuRes->r80Result.sj64.fSign = 0;
3892 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3893 pFpuRes->r80Result.sj64.fInteger = 1;
3894 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3895 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3896 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3897 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3898}
3899
3900
3901IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3902{
3903 pFpuRes->r80Result.sj64.fSign = 0;
3904 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3905 pFpuRes->r80Result.sj64.fInteger = 1;
3906 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3907 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3908 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3909 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3910}
3911
3912
3913IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3914{
3915 pFpuRes->r80Result.s.fSign = 0;
3916 pFpuRes->r80Result.s.uExponent = 0;
3917 pFpuRes->r80Result.s.uMantissa = 0;
3918 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3919}
3920
3921#define EMIT_FILD(a_cBits) \
3922IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3923 int ## a_cBits ## _t const *piVal)) \
3924{ \
3925 int ## a_cBits ## _t iVal = *piVal; \
3926 if (iVal == 0) \
3927 { \
3928 pFpuRes->r80Result.s.fSign = 0; \
3929 pFpuRes->r80Result.s.uExponent = 0; \
3930 pFpuRes->r80Result.s.uMantissa = 0; \
3931 } \
3932 else \
3933 { \
3934 if (iVal > 0) \
3935 pFpuRes->r80Result.s.fSign = 0; \
3936 else \
3937 { \
3938 pFpuRes->r80Result.s.fSign = 1; \
3939 iVal = -iVal; \
3940 } \
3941 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3942 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3943 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3944 } \
3945 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3946}
3947EMIT_FILD(16)
3948EMIT_FILD(32)
3949EMIT_FILD(64)
3950
3951
3952IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3953{
3954 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3955 if ( pd80Val->s.abPairs[0] == 0
3956 && pd80Val->s.abPairs[1] == 0
3957 && pd80Val->s.abPairs[2] == 0
3958 && pd80Val->s.abPairs[3] == 0
3959 && pd80Val->s.abPairs[4] == 0
3960 && pd80Val->s.abPairs[5] == 0
3961 && pd80Val->s.abPairs[6] == 0
3962 && pd80Val->s.abPairs[7] == 0
3963 && pd80Val->s.abPairs[8] == 0)
3964 {
3965 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3966 pFpuRes->r80Result.s.uExponent = 0;
3967 pFpuRes->r80Result.s.uMantissa = 0;
3968 }
3969 else
3970 {
3971 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3972
3973 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3974 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3975 cPairs--;
3976
3977 uint64_t uVal = 0;
3978 uint64_t uFactor = 1;
3979 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3980 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3981 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3982
3983 unsigned const cBits = ASMBitLastSetU64(uVal);
3984 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3985 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3986 }
3987}
3988
3989
3990/*********************************************************************************************************************************
3991* x87 FPU Stores *
3992*********************************************************************************************************************************/
3993
3994/**
3995 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
3996 *
3997 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
3998 *
3999 * @returns Updated FPU status word value.
4000 * @param fSignIn Incoming sign indicator.
4001 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4002 * @param iExponentIn Unbiased exponent.
4003 * @param fFcw The FPU control word.
4004 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4005 * @param pr32Dst Where to return the output value, if one should be
4006 * returned.
4007 *
4008 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4009 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4010 */
4011static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4012 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4013{
4014 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4015 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4016 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4017 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4018 ? fRoundingOffMask
4019 : 0;
4020 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4021
4022 /*
4023 * Deal with potential overflows/underflows first, optimizing for none.
4024 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4025 */
4026 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4027 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4028 { /* likely? */ }
4029 /*
4030 * Underflow if the exponent zero or negative. This is attempted mapped
4031 * to a subnormal number when possible, with some additional trickery ofc.
4032 */
4033 else if (iExponentOut <= 0)
4034 {
4035 bool const fIsTiny = iExponentOut < 0
4036 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4037 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4038 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4039 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4040
4041 if (iExponentOut <= 0)
4042 {
4043 uMantissaIn = iExponentOut <= -63
4044 ? uMantissaIn != 0
4045 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4046 fRoundedOff = uMantissaIn & fRoundingOffMask;
4047 if (fRoundedOff && fIsTiny)
4048 fFsw |= X86_FSW_UE;
4049 iExponentOut = 0;
4050 }
4051 }
4052 /*
4053 * Overflow if at or above max exponent value or if we will reach max
4054 * when rounding. Will return +/-zero or +/-max value depending on
4055 * whether we're rounding or not.
4056 */
4057 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4058 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4059 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4060 {
4061 fFsw |= X86_FSW_OE;
4062 if (!(fFcw & X86_FCW_OM))
4063 return fFsw | X86_FSW_ES | X86_FSW_B;
4064 fFsw |= X86_FSW_PE;
4065 if (uRoundingAdd)
4066 fFsw |= X86_FSW_C1;
4067 if (!(fFcw & X86_FCW_PM))
4068 fFsw |= X86_FSW_ES | X86_FSW_B;
4069
4070 pr32Dst->s.fSign = fSignIn;
4071 if (uRoundingAdd)
4072 { /* Zero */
4073 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4074 pr32Dst->s.uFraction = 0;
4075 }
4076 else
4077 { /* Max */
4078 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4079 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4080 }
4081 return fFsw;
4082 }
4083
4084 /*
4085 * Normal or subnormal number.
4086 */
4087 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4088 uint64_t uMantissaOut = uMantissaIn;
4089 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4090 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4091 || fRoundedOff != uRoundingAdd)
4092 {
4093 uMantissaOut = uMantissaIn + uRoundingAdd;
4094 if (uMantissaOut >= uMantissaIn)
4095 { /* likely */ }
4096 else
4097 {
4098 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4099 iExponentOut++;
4100 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4101 fFsw |= X86_FSW_C1;
4102 }
4103 }
4104 else
4105 uMantissaOut = uMantissaIn;
4106
4107 /* Truncate the mantissa and set the return value. */
4108 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4109
4110 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4111 pr32Dst->s.uExponent = iExponentOut;
4112 pr32Dst->s.fSign = fSignIn;
4113
4114 /* Set status flags realted to rounding. */
4115 if (fRoundedOff)
4116 {
4117 fFsw |= X86_FSW_PE;
4118 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4119 fFsw |= X86_FSW_C1;
4120 if (!(fFcw & X86_FCW_PM))
4121 fFsw |= X86_FSW_ES | X86_FSW_B;
4122 }
4123
4124 return fFsw;
4125}
4126
4127
4128/**
4129 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4130 */
4131IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4132 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4133{
4134 uint16_t const fFcw = pFpuState->FCW;
4135 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4136 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4137 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4138 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4139 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4140 {
4141 pr32Dst->s.fSign = pr80Src->s.fSign;
4142 pr32Dst->s.uExponent = 0;
4143 pr32Dst->s.uFraction = 0;
4144 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4145 }
4146 else if (RTFLOAT80U_IS_INF(pr80Src))
4147 {
4148 pr32Dst->s.fSign = pr80Src->s.fSign;
4149 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4150 pr32Dst->s.uFraction = 0;
4151 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4152 }
4153 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4154 {
4155 /* Mapped to +/-QNaN */
4156 pr32Dst->s.fSign = pr80Src->s.fSign;
4157 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4158 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4159 }
4160 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4161 {
4162 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4163 if (fFcw & X86_FCW_IM)
4164 {
4165 pr32Dst->s.fSign = 1;
4166 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4167 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4168 fFsw |= X86_FSW_IE;
4169 }
4170 else
4171 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4172 }
4173 else if (RTFLOAT80U_IS_NAN(pr80Src))
4174 {
4175 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4176 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4177 {
4178 pr32Dst->s.fSign = pr80Src->s.fSign;
4179 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4180 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4181 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4182 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4183 fFsw |= X86_FSW_IE;
4184 }
4185 else
4186 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4187 }
4188 else
4189 {
4190 /* Denormal values causes both an underflow and precision exception. */
4191 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4192 if (fFcw & X86_FCW_UM)
4193 {
4194 pr32Dst->s.fSign = pr80Src->s.fSign;
4195 pr32Dst->s.uExponent = 0;
4196 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4197 {
4198 pr32Dst->s.uFraction = 1;
4199 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4200 if (!(fFcw & X86_FCW_PM))
4201 fFsw |= X86_FSW_ES | X86_FSW_B;
4202 }
4203 else
4204 {
4205 pr32Dst->s.uFraction = 0;
4206 fFsw |= X86_FSW_UE | X86_FSW_PE;
4207 if (!(fFcw & X86_FCW_PM))
4208 fFsw |= X86_FSW_ES | X86_FSW_B;
4209 }
4210 }
4211 else
4212 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4213 }
4214 *pu16FSW = fFsw;
4215}
4216
4217
4218/**
4219 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4220 *
4221 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4222 *
4223 * @returns Updated FPU status word value.
4224 * @param fSignIn Incoming sign indicator.
4225 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4226 * @param iExponentIn Unbiased exponent.
4227 * @param fFcw The FPU control word.
4228 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4229 * @param pr64Dst Where to return the output value, if one should be
4230 * returned.
4231 *
4232 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4233 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4234 */
4235static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4236 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4237{
4238 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4239 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4240 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4241 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4242 ? fRoundingOffMask
4243 : 0;
4244 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4245
4246 /*
4247 * Deal with potential overflows/underflows first, optimizing for none.
4248 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4249 */
4250 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4251 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4252 { /* likely? */ }
4253 /*
4254 * Underflow if the exponent zero or negative. This is attempted mapped
4255 * to a subnormal number when possible, with some additional trickery ofc.
4256 */
4257 else if (iExponentOut <= 0)
4258 {
4259 bool const fIsTiny = iExponentOut < 0
4260 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4261 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4262 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4263 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4264
4265 if (iExponentOut <= 0)
4266 {
4267 uMantissaIn = iExponentOut <= -63
4268 ? uMantissaIn != 0
4269 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4270 fRoundedOff = uMantissaIn & fRoundingOffMask;
4271 if (fRoundedOff && fIsTiny)
4272 fFsw |= X86_FSW_UE;
4273 iExponentOut = 0;
4274 }
4275 }
4276 /*
4277 * Overflow if at or above max exponent value or if we will reach max
4278 * when rounding. Will return +/-zero or +/-max value depending on
4279 * whether we're rounding or not.
4280 */
4281 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4282 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4283 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4284 {
4285 fFsw |= X86_FSW_OE;
4286 if (!(fFcw & X86_FCW_OM))
4287 return fFsw | X86_FSW_ES | X86_FSW_B;
4288 fFsw |= X86_FSW_PE;
4289 if (uRoundingAdd)
4290 fFsw |= X86_FSW_C1;
4291 if (!(fFcw & X86_FCW_PM))
4292 fFsw |= X86_FSW_ES | X86_FSW_B;
4293
4294 pr64Dst->s64.fSign = fSignIn;
4295 if (uRoundingAdd)
4296 { /* Zero */
4297 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4298 pr64Dst->s64.uFraction = 0;
4299 }
4300 else
4301 { /* Max */
4302 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4303 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4304 }
4305 return fFsw;
4306 }
4307
4308 /*
4309 * Normal or subnormal number.
4310 */
4311 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4312 uint64_t uMantissaOut = uMantissaIn;
4313 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4314 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4315 || fRoundedOff != uRoundingAdd)
4316 {
4317 uMantissaOut = uMantissaIn + uRoundingAdd;
4318 if (uMantissaOut >= uMantissaIn)
4319 { /* likely */ }
4320 else
4321 {
4322 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4323 iExponentOut++;
4324 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4325 fFsw |= X86_FSW_C1;
4326 }
4327 }
4328 else
4329 uMantissaOut = uMantissaIn;
4330
4331 /* Truncate the mantissa and set the return value. */
4332 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4333
4334 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4335 pr64Dst->s64.uExponent = iExponentOut;
4336 pr64Dst->s64.fSign = fSignIn;
4337
4338 /* Set status flags realted to rounding. */
4339 if (fRoundedOff)
4340 {
4341 fFsw |= X86_FSW_PE;
4342 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4343 fFsw |= X86_FSW_C1;
4344 if (!(fFcw & X86_FCW_PM))
4345 fFsw |= X86_FSW_ES | X86_FSW_B;
4346 }
4347
4348 return fFsw;
4349}
4350
4351
4352/**
4353 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4354 */
4355IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4356 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4357{
4358 uint16_t const fFcw = pFpuState->FCW;
4359 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4360 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4361 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4362 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4363 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4364 {
4365 pr64Dst->s64.fSign = pr80Src->s.fSign;
4366 pr64Dst->s64.uExponent = 0;
4367 pr64Dst->s64.uFraction = 0;
4368 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4369 }
4370 else if (RTFLOAT80U_IS_INF(pr80Src))
4371 {
4372 pr64Dst->s64.fSign = pr80Src->s.fSign;
4373 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4374 pr64Dst->s64.uFraction = 0;
4375 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4376 }
4377 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4378 {
4379 /* Mapped to +/-QNaN */
4380 pr64Dst->s64.fSign = pr80Src->s.fSign;
4381 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4382 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4383 }
4384 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4385 {
4386 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4387 if (fFcw & X86_FCW_IM)
4388 {
4389 pr64Dst->s64.fSign = 1;
4390 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4391 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4392 fFsw |= X86_FSW_IE;
4393 }
4394 else
4395 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4396 }
4397 else if (RTFLOAT80U_IS_NAN(pr80Src))
4398 {
4399 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4400 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4401 {
4402 pr64Dst->s64.fSign = pr80Src->s.fSign;
4403 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4404 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4405 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4406 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4407 fFsw |= X86_FSW_IE;
4408 }
4409 else
4410 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4411 }
4412 else
4413 {
4414 /* Denormal values causes both an underflow and precision exception. */
4415 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4416 if (fFcw & X86_FCW_UM)
4417 {
4418 pr64Dst->s64.fSign = pr80Src->s.fSign;
4419 pr64Dst->s64.uExponent = 0;
4420 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4421 {
4422 pr64Dst->s64.uFraction = 1;
4423 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4424 if (!(fFcw & X86_FCW_PM))
4425 fFsw |= X86_FSW_ES | X86_FSW_B;
4426 }
4427 else
4428 {
4429 pr64Dst->s64.uFraction = 0;
4430 fFsw |= X86_FSW_UE | X86_FSW_PE;
4431 if (!(fFcw & X86_FCW_PM))
4432 fFsw |= X86_FSW_ES | X86_FSW_B;
4433 }
4434 }
4435 else
4436 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4437 }
4438 *pu16FSW = fFsw;
4439}
4440
4441
4442IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4443 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4444{
4445 /*
4446 * FPU status word:
4447 * - TOP is irrelevant, but we must match x86 assembly version (0).
4448 * - C1 is always cleared as we don't have any stack overflows.
4449 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4450 */
4451 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4452 *pr80Dst = *pr80Src;
4453}
4454
4455
4456/*
4457 *
4458 * Mantissa:
4459 * 63 56 48 40 32 24 16 8 0
4460 * v v v v v v v v v
4461 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4462 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4463 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4464 *
4465 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4466 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4467 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4468 * where we'll drop off all but bit 63.
4469 */
4470#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4471IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4472 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4473{ \
4474 uint16_t const fFcw = pFpuState->FCW; \
4475 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4476 bool const fSignIn = pr80Val->s.fSign; \
4477 \
4478 /* \
4479 * Deal with normal numbers first. \
4480 */ \
4481 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4482 { \
4483 uint64_t uMantissa = pr80Val->s.uMantissa; \
4484 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4485 \
4486 if ((uint32_t)iExponent <= a_cBits - 2) \
4487 { \
4488 unsigned const cShiftOff = 63 - iExponent; \
4489 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4490 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4491 ? RT_BIT_64(cShiftOff - 1) \
4492 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4493 ? fRoundingOffMask \
4494 : 0; \
4495 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4496 \
4497 uMantissa >>= cShiftOff; \
4498 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4499 uMantissa += uRounding; \
4500 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4501 { \
4502 if (fRoundedOff) \
4503 { \
4504 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4505 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4506 else if (uRounding) \
4507 fFsw |= X86_FSW_C1; \
4508 fFsw |= X86_FSW_PE; \
4509 if (!(fFcw & X86_FCW_PM)) \
4510 fFsw |= X86_FSW_ES | X86_FSW_B; \
4511 } \
4512 \
4513 if (!fSignIn) \
4514 *piDst = (a_iType)uMantissa; \
4515 else \
4516 *piDst = -(a_iType)uMantissa; \
4517 } \
4518 else \
4519 { \
4520 /* overflowed after rounding. */ \
4521 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4522 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4523 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4524 \
4525 /* Special case for the integer minimum value. */ \
4526 if (fSignIn) \
4527 { \
4528 *piDst = a_iTypeMin; \
4529 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4530 if (!(fFcw & X86_FCW_PM)) \
4531 fFsw |= X86_FSW_ES | X86_FSW_B; \
4532 } \
4533 else \
4534 { \
4535 fFsw |= X86_FSW_IE; \
4536 if (fFcw & X86_FCW_IM) \
4537 *piDst = a_iTypeMin; \
4538 else \
4539 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4540 } \
4541 } \
4542 } \
4543 /* \
4544 * Tiny sub-zero numbers. \
4545 */ \
4546 else if (iExponent < 0) \
4547 { \
4548 if (!fSignIn) \
4549 { \
4550 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4551 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4552 { \
4553 *piDst = 1; \
4554 fFsw |= X86_FSW_C1; \
4555 } \
4556 else \
4557 *piDst = 0; \
4558 } \
4559 else \
4560 { \
4561 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4562 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4563 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4564 *piDst = 0; \
4565 else \
4566 { \
4567 *piDst = -1; \
4568 fFsw |= X86_FSW_C1; \
4569 } \
4570 } \
4571 fFsw |= X86_FSW_PE; \
4572 if (!(fFcw & X86_FCW_PM)) \
4573 fFsw |= X86_FSW_ES | X86_FSW_B; \
4574 } \
4575 /* \
4576 * Special MIN case. \
4577 */ \
4578 else if ( fSignIn && iExponent == a_cBits - 1 \
4579 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4580 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4581 : uMantissa == RT_BIT_64(63))) \
4582 { \
4583 *piDst = a_iTypeMin; \
4584 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4585 { \
4586 fFsw |= X86_FSW_PE; \
4587 if (!(fFcw & X86_FCW_PM)) \
4588 fFsw |= X86_FSW_ES | X86_FSW_B; \
4589 } \
4590 } \
4591 /* \
4592 * Too large/small number outside the target integer range. \
4593 */ \
4594 else \
4595 { \
4596 fFsw |= X86_FSW_IE; \
4597 if (fFcw & X86_FCW_IM) \
4598 *piDst = a_iTypeIndefinite; \
4599 else \
4600 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4601 } \
4602 } \
4603 /* \
4604 * Map both +0 and -0 to integer zero (signless/+). \
4605 */ \
4606 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4607 *piDst = 0; \
4608 /* \
4609 * Denormals are just really tiny sub-zero numbers that are either rounded \
4610 * to zero, 1 or -1 depending on sign and rounding control. \
4611 */ \
4612 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4613 { \
4614 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4615 *piDst = 0; \
4616 else \
4617 { \
4618 *piDst = fSignIn ? -1 : 1; \
4619 fFsw |= X86_FSW_C1; \
4620 } \
4621 fFsw |= X86_FSW_PE; \
4622 if (!(fFcw & X86_FCW_PM)) \
4623 fFsw |= X86_FSW_ES | X86_FSW_B; \
4624 } \
4625 /* \
4626 * All other special values are considered invalid arguments and result \
4627 * in an IE exception and indefinite value if masked. \
4628 */ \
4629 else \
4630 { \
4631 fFsw |= X86_FSW_IE; \
4632 if (fFcw & X86_FCW_IM) \
4633 *piDst = a_iTypeIndefinite; \
4634 else \
4635 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4636 } \
4637 *pu16FSW = fFsw; \
4638}
4639EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4640EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4641EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4642
4643#endif /*IEM_WITHOUT_ASSEMBLY */
4644
4645
4646/*
4647 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4648 *
4649 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4650 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4651 * thus the @a a_cBitsIn.
4652 */
4653#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4654IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4655 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4656{ \
4657 uint16_t const fFcw = pFpuState->FCW; \
4658 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4659 bool const fSignIn = pr80Val->s.fSign; \
4660 \
4661 /* \
4662 * Deal with normal numbers first. \
4663 */ \
4664 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4665 { \
4666 uint64_t uMantissa = pr80Val->s.uMantissa; \
4667 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4668 \
4669 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4670 { \
4671 unsigned const cShiftOff = 63 - iExponent; \
4672 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4673 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4674 uMantissa >>= cShiftOff; \
4675 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4676 if (!fSignIn) \
4677 *piDst = (a_iType)uMantissa; \
4678 else \
4679 *piDst = -(a_iType)uMantissa; \
4680 \
4681 if (fRoundedOff) \
4682 { \
4683 fFsw |= X86_FSW_PE; \
4684 if (!(fFcw & X86_FCW_PM)) \
4685 fFsw |= X86_FSW_ES | X86_FSW_B; \
4686 } \
4687 } \
4688 /* \
4689 * Tiny sub-zero numbers. \
4690 */ \
4691 else if (iExponent < 0) \
4692 { \
4693 *piDst = 0; \
4694 fFsw |= X86_FSW_PE; \
4695 if (!(fFcw & X86_FCW_PM)) \
4696 fFsw |= X86_FSW_ES | X86_FSW_B; \
4697 } \
4698 /* \
4699 * Special MIN case. \
4700 */ \
4701 else if ( fSignIn && iExponent == a_cBits - 1 \
4702 && (a_cBits < 64 \
4703 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4704 : uMantissa == RT_BIT_64(63)) ) \
4705 { \
4706 *piDst = a_iTypeMin; \
4707 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4708 { \
4709 fFsw |= X86_FSW_PE; \
4710 if (!(fFcw & X86_FCW_PM)) \
4711 fFsw |= X86_FSW_ES | X86_FSW_B; \
4712 } \
4713 } \
4714 /* \
4715 * Figure this weirdness. \
4716 */ \
4717 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4718 { \
4719 *piDst = 0; \
4720 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4721 { \
4722 fFsw |= X86_FSW_PE; \
4723 if (!(fFcw & X86_FCW_PM)) \
4724 fFsw |= X86_FSW_ES | X86_FSW_B; \
4725 } \
4726 } \
4727 /* \
4728 * Too large/small number outside the target integer range. \
4729 */ \
4730 else \
4731 { \
4732 fFsw |= X86_FSW_IE; \
4733 if (fFcw & X86_FCW_IM) \
4734 *piDst = a_iTypeIndefinite; \
4735 else \
4736 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4737 } \
4738 } \
4739 /* \
4740 * Map both +0 and -0 to integer zero (signless/+). \
4741 */ \
4742 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4743 *piDst = 0; \
4744 /* \
4745 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4746 */ \
4747 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4748 { \
4749 *piDst = 0; \
4750 fFsw |= X86_FSW_PE; \
4751 if (!(fFcw & X86_FCW_PM)) \
4752 fFsw |= X86_FSW_ES | X86_FSW_B; \
4753 } \
4754 /* \
4755 * All other special values are considered invalid arguments and result \
4756 * in an IE exception and indefinite value if masked. \
4757 */ \
4758 else \
4759 { \
4760 fFsw |= X86_FSW_IE; \
4761 if (fFcw & X86_FCW_IM) \
4762 *piDst = a_iTypeIndefinite; \
4763 else \
4764 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4765 } \
4766 *pu16FSW = fFsw; \
4767}
4768#if defined(IEM_WITHOUT_ASSEMBLY)
4769EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4770EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4771EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4772#endif
4773EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4774EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4775
4776
4777#if defined(IEM_WITHOUT_ASSEMBLY)
4778
/**
 * Converts an 80-bit floating point value to an 80-bit packed BCD integer,
 * rounding according to the rounding control (RC) field of FCW.
 *
 * C0, C2 and C3 are carried over from the input FSW.  PE is set when bits were
 * rounded off and C1 when the rounding increased the magnitude.  Values that
 * do not fit and special values other than zero and (pseudo-)denormals flag
 * IE; if IE is masked the BCD indefinite value is stored, otherwise the
 * destination is left untouched.
 *
 * @param   pFpuState   The FPU state; FCW and the incoming FSW are read from it.
 * @param   pu16FSW     Where to return the resulting FSW.
 * @param   pd80Dst     Where to store the packed BCD result.
 * @param   pr80Src     The value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
{
    /* The tables below are indexed by the input sign bit (fSignIn). */
    /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
    static RTPBCD80U const s_ad80Zeros[2]  = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
    static RTPBCD80U const s_ad80One[2]    = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
                                               RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
    static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();

    uint16_t const fFcw    = pFpuState->FCW;
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    bool const     fSignIn = pr80Src->s.fSign;

    /*
     * Deal with normal numbers first.
     */
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
    {
        uint64_t uMantissa = pr80Src->s.uMantissa;
        int32_t  iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
        /* Candidates that may fit; the final range check against RTPBCD80U_MAX is done after rounding. */
        if (   (uint32_t)iExponent <= 58
            || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
        {
            /* cShiftOff is the number of low mantissa bits that make up the fractional part. */
            unsigned const cShiftOff        = 63 - iExponent;
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                            ? RT_BIT_64(cShiftOff - 1)
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                            ? fRoundingOffMask
                                            : 0;
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask;

            uMantissa >>= cShiftOff;
            /* uRounding is 1 when the fractional part plus the rounding addend carries into the integer part. */
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
            uMantissa += uRounding;
            if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
            {
                if (fRoundedOff)
                {
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
                    else if (uRounding)
                        fFsw |= X86_FSW_C1;
                    fFsw |= X86_FSW_PE;
                    if (!(fFcw & X86_FCW_PM))
                        fFsw |= X86_FSW_ES | X86_FSW_B;
                }

                /* Emit the decimal digits two at a time, least significant pair first. */
                pd80Dst->s.fSign = fSignIn;
                pd80Dst->s.uPad  = 0;
                for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
                {
                    unsigned const uDigits = uMantissa % 100;
                    uMantissa /= 100;
                    uint8_t const bLo = uDigits % 10;
                    uint8_t const bHi = uDigits / 10;
                    pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
                }
            }
            else
            {
                /* overflowed after rounding. */
                fFsw |= X86_FSW_IE;
                if (fFcw & X86_FCW_IM)
                    *pd80Dst = s_d80Indefinite;
                else
                    fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
            }
        }
        /*
         * Tiny sub-zero numbers.
         * (Magnitude below one; the result is a signed zero or signed one
         * depending on sign and rounding control.)
         */
        else if (iExponent < 0)
        {
            if (!fSignIn)
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
                else
                    *pd80Dst = s_ad80Zeros[fSignIn];
            }
            else
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                    *pd80Dst = s_ad80Zeros[fSignIn];
                else
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
            }
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        /*
         * Too large/small number outside the target integer range.
         */
        else
        {
            fFsw |= X86_FSW_IE;
            if (fFcw & X86_FCW_IM)
                *pd80Dst = s_d80Indefinite;
            else
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    /*
     * Map +0 and -0 to the packed BCD zero entry selected by the input sign
     * bit (unlike the integer stores, the zero table is indexed by fSignIn).
     */
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
        *pd80Dst = s_ad80Zeros[fSignIn];
    /*
     * Denormals are just really tiny sub-zero numbers that are either rounded
     * to zero, 1 or -1 depending on sign and rounding control.
     */
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
    {
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
            *pd80Dst = s_ad80Zeros[fSignIn];
        else
        {
            *pd80Dst = s_ad80One[fSignIn];
            fFsw |= X86_FSW_C1;
        }
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }
    /*
     * All other special values are considered invalid arguments and result
     * in an IE exception and indefinite value if masked.
     */
    else
    {
        fFsw |= X86_FSW_IE;
        if (fFcw & X86_FCW_IM)
            *pd80Dst = s_d80Indefinite;
        else
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
    }
    *pu16FSW = fFsw;
}
4928
4929
4930/*********************************************************************************************************************************
4931* FPU Helpers *
4932*********************************************************************************************************************************/
/* Compile-time checks that the IPRT floating point unions have the sizes the helpers below rely on. */
AssertCompileSize(RTFLOAT128U, 16);
AssertCompileSize(RTFLOAT80U, 10);
AssertCompileSize(RTFLOAT64U, 8);
AssertCompileSize(RTFLOAT32U, 4);
4937
4938/**
4939 * Normalizes a possible pseudo-normal value.
4940 *
4941 * Psuedo-normal values are some oddities from the 8087 & 287 days. They are
4942 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
4943 * i.e. changing uExponent from 0 to 1.
4944 *
4945 * This macro will declare a RTFLOAT80U with the name given by
4946 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
4947 * a normalization was performed.
4948 *
4949 * @note This must be applied before calling SoftFloat with a value that couldbe
4950 * a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
4951 * correctly.
4952 */
4953#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
4954 RTFLOAT80U a_r80ValNormalized; \
4955 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
4956 { \
4957 a_r80ValNormalized = *a_pr80Val; \
4958 a_r80ValNormalized.s.uExponent = 1; \
4959 a_pr80Val = &a_r80ValNormalized; \
4960 } else do {} while (0)
4961
4962#ifdef IEM_WITH_FLOAT128_FOR_FPU
4963
4964DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4965{
4966 int fNew;
4967 switch (fFcw & X86_FCW_RC_MASK)
4968 {
4969 default:
4970 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4971 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4972 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4973 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4974 }
4975 int fOld = fegetround();
4976 fesetround(fNew);
4977 return fOld;
4978}
4979
4980
4981DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4982{
4983 fesetround(fOld);
4984}
4985
4986DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4987{
4988 RT_NOREF(fFcw);
4989 RTFLOAT128U Tmp;
4990 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4991 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4992 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4993 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4994 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4995 {
4996 Assert(Tmp.s.uExponent == 0);
4997 Tmp.s2.uSignAndExponent++;
4998 }
4999 return *(_Float128 *)&Tmp;
5000}
5001
5002
5003DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5004{
5005 RT_NOREF(fFcw);
5006 RTFLOAT128U Tmp;
5007 *(_Float128 *)&Tmp = rd128ValSrc;
5008 ASMCompilerBarrier();
5009 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5010 {
5011 pr80Dst->s.fSign = Tmp.s64.fSign;
5012 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5013 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5014 | Tmp.s64.uFractionLo >> (64 - 15);
5015
5016 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5017 unsigned const cShiftOff = 64 - 15;
5018 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5019 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5020 if (uRoundedOff)
5021 {
5022 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5023 ? RT_BIT_64(cShiftOff - 1)
5024 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5025 ? fRoundingOffMask
5026 : 0;
5027 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5028 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5029 || uRoundedOff != uRoundingAdd)
5030 {
5031 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5032 {
5033 uFraction += 1;
5034 if (!(uFraction & RT_BIT_64(63)))
5035 { /* likely */ }
5036 else
5037 {
5038 uFraction >>= 1;
5039 pr80Dst->s.uExponent++;
5040 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5041 return fFsw;
5042 }
5043 fFsw |= X86_FSW_C1;
5044 }
5045 }
5046 fFsw |= X86_FSW_PE;
5047 if (!(fFcw & X86_FCW_PM))
5048 fFsw |= X86_FSW_ES | X86_FSW_B;
5049 }
5050 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5051 }
5052 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5053 {
5054 pr80Dst->s.fSign = Tmp.s64.fSign;
5055 pr80Dst->s.uExponent = 0;
5056 pr80Dst->s.uMantissa = 0;
5057 }
5058 else if (RTFLOAT128U_IS_INF(&Tmp))
5059 {
5060 pr80Dst->s.fSign = Tmp.s64.fSign;
5061 pr80Dst->s.uExponent = 0;
5062 pr80Dst->s.uMantissa = 0;
5063 }
5064 return fFsw;
5065}
5066
5067
5068#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5069
/**
 * Initializer for the SoftFloat state structure.
 *
 * Expands to a braced initializer with five members derived from @a a_fFcw, in
 * this order (NOTE(review): the member names are presumed from the values,
 * confirm against softfloat_state_t):
 *      -# tininess detection: always softfloat_tininess_afterRounding,
 *      -# rounding mode: mapped from the RC field (nearest/up/down, anything
 *         else gives softfloat_round_minMag),
 *      -# exception flags: 0,
 *      -# exception mask: the X86_FCW_XCPT_MASK bits of the control word,
 *      -# rounding precision: 64 for X86_FCW_PC_53, 32 for X86_FCW_PC_24 and 80
 *         otherwise.
 *
 * @param   a_fFcw      The FPU control word.  Evaluated several times.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
          ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN    ? (uint8_t)softfloat_round_min \
        :                                                      (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
          ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
    }
5083
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The result is @a a_fFsw with the following ORed in:
 *      - the softfloat_flag_c1 exception flag shifted left by two bits
 *        (presumably landing on X86_FSW_C1 - confirm against the flag value),
 *      - the exception flags within X86_FSW_XCPT_MASK,
 *      - X86_FSW_ES and X86_FSW_B if any of those exceptions is not masked in
 *        @a a_fFcw.
 *
 * @param   a_fFsw          The FSW to update.
 * @param   a_pSoftState    The SoftFloat state following the operation.
 *                          Evaluated several times.
 * @param   a_fFcw          The FPU control word supplying the exception mask.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (  (a_fFsw) \
     | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
     | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
     | (  ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
        ? X86_FSW_ES | X86_FSW_B : 0) )
5091
5092
5093DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5094{
5095 RT_NOREF(fFcw);
5096 Assert(cBits > 64);
5097# if 0 /* rounding does not seem to help */
5098 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5099 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5100 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5101 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5102 {
5103 uint64_t uOld = r128.v[0];
5104 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5105 if (r128.v[0] < uOld)
5106 r128.v[1] += 1;
5107 }
5108# else
5109 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5110# endif
5111 return r128;
5112}
5113
5114
5115DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5116{
5117 RT_NOREF(fFcw);
5118 Assert(cBits > 64);
5119# if 0 /* rounding does not seem to help, not even on constants */
5120 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5121 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5122 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5123 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5124 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5125 {
5126 uint64_t uOld = r128.v[0];
5127 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5128 if (r128.v[0] < uOld)
5129 r128.v[1] += 1;
5130 }
5131 return r128;
5132# else
5133 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5134 return r128;
5135# endif
5136}
5137
5138
# if 0 /* unused */
/**
 * Converts from the IPRT 128-bit floating point format to the SoftFloat one.
 *
 * This is only a structure format conversion, nothing else.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128Ret;
    r128Ret.v[0] = pr128->au64[0];
    r128Ret.v[1] = pr128->au64[1];
    return r128Ret;
}
# endif
5146
5147
5148/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5149DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5150{
5151 extFloat80_t Tmp;
5152 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5153 Tmp.signif = pr80Val->s2.uMantissa;
5154 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5155 return extF80_to_f128(Tmp, &Ignored);
5156}
5157
5158
5159/**
5160 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5161 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5162 *
5163 * This is only a structure format conversion, nothing else.
5164 */
5165DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5166{
5167 extFloat80_t Tmp;
5168 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5169 Tmp.signif = pr80Val->s2.uMantissa;
5170 return Tmp;
5171}
5172
5173
5174/**
5175 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5176 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5177 *
5178 * This is only a structure format conversion, nothing else.
5179 */
5180DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5181{
5182 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5183 pr80Dst->s2.uMantissa = r80XSrc.signif;
5184 return pr80Dst;
5185}
5186
5187
5188DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5189{
5190 RT_NOREF(fFcw);
5191 RTFLOAT128U Tmp;
5192 *(float128_t *)&Tmp = r128Src;
5193 ASMCompilerBarrier();
5194
5195 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5196 {
5197 pr80Dst->s.fSign = Tmp.s64.fSign;
5198 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5199 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5200 | Tmp.s64.uFractionLo >> (64 - 15);
5201
5202 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5203 unsigned const cShiftOff = 64 - 15;
5204 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5205 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5206 if (uRoundedOff)
5207 {
5208 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5209 ? RT_BIT_64(cShiftOff - 1)
5210 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5211 ? fRoundingOffMask
5212 : 0;
5213 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5214 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5215 || uRoundedOff != uRoundingAdd)
5216 {
5217 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5218 {
5219 uFraction += 1;
5220 if (!(uFraction & RT_BIT_64(63)))
5221 { /* likely */ }
5222 else
5223 {
5224 uFraction >>= 1;
5225 pr80Dst->s.uExponent++;
5226 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5227 return fFsw;
5228 }
5229 fFsw |= X86_FSW_C1;
5230 }
5231 }
5232 fFsw |= X86_FSW_PE;
5233 if (!(fFcw & X86_FCW_PM))
5234 fFsw |= X86_FSW_ES | X86_FSW_B;
5235 }
5236
5237 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5238 }
5239 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5240 {
5241 pr80Dst->s.fSign = Tmp.s64.fSign;
5242 pr80Dst->s.uExponent = 0;
5243 pr80Dst->s.uMantissa = 0;
5244 }
5245 else if (RTFLOAT128U_IS_INF(&Tmp))
5246 {
5247 pr80Dst->s.fSign = Tmp.s64.fSign;
5248 pr80Dst->s.uExponent = 0x7fff;
5249 pr80Dst->s.uMantissa = 0;
5250 }
5251 return fFsw;
5252}
5253
5254
5255/**
5256 * Helper for transfering exception and C1 to FSW and setting the result value
5257 * accordingly.
5258 *
5259 * @returns Updated FSW.
5260 * @param pSoftState The SoftFloat state following the operation.
5261 * @param r80XResult The result of the SoftFloat operation.
5262 * @param pr80Result Where to store the result for IEM.
5263 * @param fFcw The FPU control word.
5264 * @param fFsw The FSW before the operation, with necessary bits
5265 * cleared and such.
5266 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5267 * raised.
5268 */
5269DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5270 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5271 PCRTFLOAT80U pr80XcptResult)
5272{
5273 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5274 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5275 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5276 fFsw |= X86_FSW_ES | X86_FSW_B;
5277
5278 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5279 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5280 else
5281 {
5282 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5283 *pr80Result = *pr80XcptResult;
5284 }
5285 return fFsw;
5286}
5287
5288
5289/**
5290 * Helper doing polynomial evaluation using Horner's method.
5291 *
5292 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5293 */
5294float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5295 unsigned cPrecision, softfloat_state_t *pSoftState)
5296{
5297 Assert(cHornerConsts > 1);
5298 size_t i = cHornerConsts - 1;
5299 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5300 while (i-- > 0)
5301 {
5302 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5303 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5304 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5305 }
5306 return r128Result;
5307}
5308
5309#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5310
5311
5312/**
5313 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5314 * mantissa, exponent and sign.
5315 *
5316 * @returns Updated FSW.
5317 * @param pr80Dst Where to return the composed value.
5318 * @param fSign The sign.
5319 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5320 * ignored and should be zero. This will probably be
5321 * modified during normalization and rounding.
5322 * @param iExponent Unbiased exponent.
5323 * @param fFcw The FPU control word.
5324 * @param fFsw The FPU status word.
5325 */
5326static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5327 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5328{
5329 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5330
5331 iExponent += RTFLOAT80U_EXP_BIAS;
5332
5333 /* Do normalization if necessary and possible. */
5334 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5335 {
5336 int cShift = 192 - RTUInt256BitCount(puMantissa);
5337 if (iExponent > cShift)
5338 iExponent -= cShift;
5339 else
5340 {
5341 if (fFcw & X86_FCW_UM)
5342 {
5343 if (iExponent > 0)
5344 cShift = --iExponent;
5345 else
5346 cShift = 0;
5347 }
5348 iExponent -= cShift;
5349 }
5350 RTUInt256AssignShiftLeft(puMantissa, cShift);
5351 }
5352
5353 /* Do rounding. */
5354 uint64_t uMantissa = puMantissa->QWords.qw2;
5355 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5356 {
5357 bool fAdd;
5358 switch (fFcw & X86_FCW_RC_MASK)
5359 {
5360 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5361 case X86_FCW_RC_NEAREST:
5362 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5363 {
5364 if ( (uMantissa & 1)
5365 || puMantissa->QWords.qw0 != 0
5366 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5367 {
5368 fAdd = true;
5369 break;
5370 }
5371 uMantissa &= ~(uint64_t)1;
5372 }
5373 fAdd = false;
5374 break;
5375 case X86_FCW_RC_ZERO:
5376 fAdd = false;
5377 break;
5378 case X86_FCW_RC_UP:
5379 fAdd = !fSign;
5380 break;
5381 case X86_FCW_RC_DOWN:
5382 fAdd = fSign;
5383 break;
5384 }
5385 if (fAdd)
5386 {
5387 uint64_t const uTmp = uMantissa;
5388 uMantissa = uTmp + 1;
5389 if (uMantissa < uTmp)
5390 {
5391 uMantissa >>= 1;
5392 uMantissa |= RT_BIT_64(63);
5393 iExponent++;
5394 }
5395 fFsw |= X86_FSW_C1;
5396 }
5397 fFsw |= X86_FSW_PE;
5398 if (!(fFcw & X86_FCW_PM))
5399 fFsw |= X86_FSW_ES | X86_FSW_B;
5400 }
5401
5402 /* Check for underflow (denormals). */
5403 if (iExponent <= 0)
5404 {
5405 if (fFcw & X86_FCW_UM)
5406 {
5407 if (uMantissa & RT_BIT_64(63))
5408 uMantissa >>= 1;
5409 iExponent = 0;
5410 }
5411 else
5412 {
5413 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5414 fFsw |= X86_FSW_ES | X86_FSW_B;
5415 }
5416 fFsw |= X86_FSW_UE;
5417 }
5418 /* Check for overflow */
5419 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5420 {
5421 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5422 }
5423
5424 /* Compose the result. */
5425 pr80Dst->s.uMantissa = uMantissa;
5426 pr80Dst->s.uExponent = iExponent;
5427 pr80Dst->s.fSign = fSign;
5428 return fFsw;
5429}
5430
5431
5432/**
5433 * See also iemAImpl_fld_r80_from_r32
5434 */
5435static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5436{
5437 uint16_t fFsw = 0;
5438 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5439 {
5440 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5441 pr80Dst->sj64.fInteger = 1;
5442 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5443 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5444 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5445 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5446 }
5447 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5448 {
5449 pr80Dst->s.fSign = pr32Val->s.fSign;
5450 pr80Dst->s.uExponent = 0;
5451 pr80Dst->s.uMantissa = 0;
5452 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5453 }
5454 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5455 {
5456 /* Subnormal -> normalized + X86_FSW_DE return. */
5457 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5458 pr80Dst->sj64.fInteger = 1;
5459 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5460 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5461 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5462 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5463 fFsw = X86_FSW_DE;
5464 }
5465 else if (RTFLOAT32U_IS_INF(pr32Val))
5466 {
5467 pr80Dst->s.fSign = pr32Val->s.fSign;
5468 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5469 pr80Dst->s.uMantissa = RT_BIT_64(63);
5470 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5471 }
5472 else
5473 {
5474 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5475 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5476 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5477 pr80Dst->sj64.fInteger = 1;
5478 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5479 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5480 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5481 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5482 }
5483 return fFsw;
5484}
5485
5486
5487/**
5488 * See also iemAImpl_fld_r80_from_r64
5489 */
5490static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5491{
5492 uint16_t fFsw = 0;
5493 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5494 {
5495 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5496 pr80Dst->sj64.fInteger = 1;
5497 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5498 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5499 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5500 }
5501 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5502 {
5503 pr80Dst->s.fSign = pr64Val->s.fSign;
5504 pr80Dst->s.uExponent = 0;
5505 pr80Dst->s.uMantissa = 0;
5506 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5507 }
5508 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5509 {
5510 /* Subnormal values gets normalized. */
5511 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5512 pr80Dst->sj64.fInteger = 1;
5513 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5514 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5515 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5516 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5517 fFsw = X86_FSW_DE;
5518 }
5519 else if (RTFLOAT64U_IS_INF(pr64Val))
5520 {
5521 pr80Dst->s.fSign = pr64Val->s.fSign;
5522 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5523 pr80Dst->s.uMantissa = RT_BIT_64(63);
5524 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5525 }
5526 else
5527 {
5528 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5529 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5530 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5531 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5532 pr80Dst->sj64.fInteger = 1;
5533 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5534 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5535 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5536 }
5537 return fFsw;
5538}
5539
5540
5541/**
5542 * See also EMIT_FILD.
5543 */
5544#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5545static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5546{ \
5547 if (iVal == 0) \
5548 { \
5549 pr80Dst->s.fSign = 0; \
5550 pr80Dst->s.uExponent = 0; \
5551 pr80Dst->s.uMantissa = 0; \
5552 } \
5553 else \
5554 { \
5555 if (iVal > 0) \
5556 pr80Dst->s.fSign = 0; \
5557 else \
5558 { \
5559 pr80Dst->s.fSign = 1; \
5560 iVal = -iVal; \
5561 } \
5562 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5563 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5564 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5565 } \
5566 return pr80Dst; \
5567}
5568EMIT_CONVERT_IXX_TO_R80(16)
5569EMIT_CONVERT_IXX_TO_R80(32)
5570//EMIT_CONVERT_IXX_TO_R80(64)
5571
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * The 64-bit operand is converted to 80-bit and handed to @a a_fnR80ByR80
 * together with the first operand.  Afterwards the TOP field of the resulting
 * FSW is replaced by 7 and X86_FSW_DE is ORed in if the conversion reported a
 * denormal operand.
 *
 * The denormal indication is dropped when the first operand is a 387-invalid
 * value or a NaN, or when @a a_DenormalException evaluates to true.  If it is
 * kept and \#D is not masked in the FCW, the first operand is returned as the
 * result with DE, ES and B set, and @a a_fnR80ByR80 is not called at all.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        if (   !RTFLOAT80U_IS_387_INVALID(pr80Val1) \
            && !RTFLOAT80U_IS_NAN(pr80Val1) \
            && !(a_DenormalException)) \
        { \
            if (!(pFpuState->FCW & X86_FCW_DM)) \
            { \
                pFpuRes->r80Result = *pr80Val1; \
                pFpuRes->FSW = X86_FSW_DE | X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT) \
                             | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
                return; \
            } \
        } \
        else \
            fFsw = 0; \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = fFsw | (7 << X86_FSW_TOP_SHIFT) | (pFpuRes->FSW & ~X86_FSW_TOP_MASK); \
}
5594
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * The 32-bit operand is converted to 80-bit and handed to @a a_fnR80ByR80
 * together with the first operand.  Afterwards the TOP field of the resulting
 * FSW is replaced by 7 and X86_FSW_DE is ORed in if the conversion reported a
 * denormal operand.
 *
 * The denormal indication is dropped when the first operand is a 387-invalid
 * value or a NaN, or when @a a_DenormalException evaluates to true.  If it is
 * kept and \#D is not masked in the FCW, the first operand is returned as the
 * result with DE, ES and B set, and @a a_fnR80ByR80 is not called at all.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        if (   !RTFLOAT80U_IS_387_INVALID(pr80Val1) \
            && !RTFLOAT80U_IS_NAN(pr80Val1) \
            && !(a_DenormalException)) \
        { \
            if (!(pFpuState->FCW & X86_FCW_DM)) \
            { \
                pFpuRes->r80Result = *pr80Val1; \
                pFpuRes->FSW = X86_FSW_DE | X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT) \
                             | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
                return; \
            } \
        } \
        else \
            fFsw = 0; \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = fFsw | (7 << X86_FSW_TOP_SHIFT) | (pFpuRes->FSW & ~X86_FSW_TOP_MASK); \
}
5617
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * Converts the 32-bit integer operand to 80-bit floating point, calls
 * @a a_fnR80ByR80 with it and then replaces the TOP field of the resulting FSW
 * by 7.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuRes->FSW & ~X86_FSW_TOP_MASK); \
}
5626
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * Converts the 16-bit integer operand to 80-bit floating point, calls
 * @a a_fnR80ByR80 with it and then replaces the TOP field of the resulting FSW
 * by 7.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuRes->FSW & ~X86_FSW_TOP_MASK); \
}
5635
5636
5637
/*********************************************************************************************************************************
*   x87 FPU Division Operations                                                                                                  *
*********************************************************************************************************************************/
5641
5642/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5643static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5644 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5645{
5646 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5647 {
5648 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5649 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5650 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5651 }
5652 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5653 { /* Div by zero. */
5654 if (fFcw & X86_FCW_ZM)
5655 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5656 else
5657 {
5658 *pr80Result = *pr80Val1Org;
5659 fFsw |= X86_FSW_ES | X86_FSW_B;
5660 }
5661 fFsw |= X86_FSW_ZE;
5662 }
5663 else
5664 { /* Invalid operand */
5665 if (fFcw & X86_FCW_IM)
5666 *pr80Result = g_r80Indefinite;
5667 else
5668 {
5669 *pr80Result = *pr80Val1Org;
5670 fFsw |= X86_FSW_ES | X86_FSW_B;
5671 }
5672 fFsw |= X86_FSW_IE;
5673 }
5674 return fFsw;
5675}
5676
5677
5678IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5679 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5680{
5681 uint16_t const fFcw = pFpuState->FCW;
5682 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5683
5684 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5685 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5686 {
5687 if (fFcw & X86_FCW_IM)
5688 pFpuRes->r80Result = g_r80Indefinite;
5689 else
5690 {
5691 pFpuRes->r80Result = *pr80Val1;
5692 fFsw |= X86_FSW_ES | X86_FSW_B;
5693 }
5694 fFsw |= X86_FSW_IE;
5695 }
5696 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5697 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5698 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5699 {
5700 if (fFcw & X86_FCW_DM)
5701 {
5702 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5703 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5704 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5705 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5706 }
5707 else
5708 {
5709 pFpuRes->r80Result = *pr80Val1;
5710 fFsw |= X86_FSW_ES | X86_FSW_B;
5711 }
5712 fFsw |= X86_FSW_DE;
5713 }
5714 /* SoftFloat can handle the rest: */
5715 else
5716 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5717
5718 pFpuRes->FSW = fFsw;
5719}
5720
5721
5722EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5723EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5724EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5725EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5726
5727
5728IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5729 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5730{
5731 uint16_t const fFcw = pFpuState->FCW;
5732 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5733
5734 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5735 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5736 {
5737 if (fFcw & X86_FCW_IM)
5738 pFpuRes->r80Result = g_r80Indefinite;
5739 else
5740 {
5741 pFpuRes->r80Result = *pr80Val1;
5742 fFsw |= X86_FSW_ES | X86_FSW_B;
5743 }
5744 fFsw |= X86_FSW_IE;
5745 }
5746 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5747 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5748 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5749 {
5750 if (fFcw & X86_FCW_DM)
5751 {
5752 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5753 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5754 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5755 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5756 }
5757 else
5758 {
5759 pFpuRes->r80Result = *pr80Val1;
5760 fFsw |= X86_FSW_ES | X86_FSW_B;
5761 }
5762 fFsw |= X86_FSW_DE;
5763 }
5764 /* SoftFloat can handle the rest: */
5765 else
5766 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5767
5768 pFpuRes->FSW = fFsw;
5769}
5770
5771
5772EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5773EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5774EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5775EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5776
5777
5778/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5779static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5780 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5781{
5782 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5783 {
5784 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5785 uint16_t fCxFlags = 0;
5786 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5787 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5788 &fCxFlags, &SoftState);
5789 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5790 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5791 if ( !(fFsw & X86_FSW_IE)
5792 && !RTFLOAT80U_IS_NAN(pr80Result)
5793 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5794 {
5795 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5796 fFsw |= fCxFlags & X86_FSW_C_MASK;
5797 }
5798 return fFsw;
5799 }
5800
5801 /* Invalid operand */
5802 if (fFcw & X86_FCW_IM)
5803 *pr80Result = g_r80Indefinite;
5804 else
5805 {
5806 *pr80Result = *pr80Val1Org;
5807 fFsw |= X86_FSW_ES | X86_FSW_B;
5808 }
5809 return fFsw | X86_FSW_IE;
5810}
5811
5812
5813static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5814 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5815{
5816 uint16_t const fFcw = pFpuState->FCW;
5817 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5818
5819 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5820 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5821 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5822 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5823 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5824 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5825 {
5826 if (fFcw & X86_FCW_IM)
5827 pFpuRes->r80Result = g_r80Indefinite;
5828 else
5829 {
5830 pFpuRes->r80Result = *pr80Val1;
5831 fFsw |= X86_FSW_ES | X86_FSW_B;
5832 }
5833 fFsw |= X86_FSW_IE;
5834 }
5835 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5836 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5837 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5838 {
5839 if (fFcw & X86_FCW_DM)
5840 {
5841 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5842 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5843 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5844 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5845 pr80Val1Org, fLegacyInstr);
5846 }
5847 else
5848 {
5849 pFpuRes->r80Result = *pr80Val1;
5850 fFsw |= X86_FSW_ES | X86_FSW_B;
5851 }
5852 fFsw |= X86_FSW_DE;
5853 }
5854 /* SoftFloat can handle the rest: */
5855 else
5856 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5857 pr80Val1, fLegacyInstr);
5858
5859 pFpuRes->FSW = fFsw;
5860}
5861
5862
5863IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5864 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5865{
5866 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5867}
5868
5869
5870IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5871 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5872{
5873 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5874}
5875
5876
5877/*********************************************************************************************************************************
5878* x87 FPU Multiplication Operations *
5879*********************************************************************************************************************************/
5880
5881/** Worker for iemAImpl_fmul_r80_by_r80. */
5882static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5883 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5884{
5885 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5886 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5887 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5888}
5889
5890
5891IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5892 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5893{
5894 uint16_t const fFcw = pFpuState->FCW;
5895 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5896
5897 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5898 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5899 {
5900 if (fFcw & X86_FCW_IM)
5901 pFpuRes->r80Result = g_r80Indefinite;
5902 else
5903 {
5904 pFpuRes->r80Result = *pr80Val1;
5905 fFsw |= X86_FSW_ES | X86_FSW_B;
5906 }
5907 fFsw |= X86_FSW_IE;
5908 }
5909 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5910 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5911 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5912 {
5913 if (fFcw & X86_FCW_DM)
5914 {
5915 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5916 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5917 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5918 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5919 }
5920 else
5921 {
5922 pFpuRes->r80Result = *pr80Val1;
5923 fFsw |= X86_FSW_ES | X86_FSW_B;
5924 }
5925 fFsw |= X86_FSW_DE;
5926 }
5927 /* SoftFloat can handle the rest: */
5928 else
5929 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5930
5931 pFpuRes->FSW = fFsw;
5932}
5933
5934
5935EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5936EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5937EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5938EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5939
5940
5941/*********************************************************************************************************************************
5942* x87 FPU Addition *
5943*********************************************************************************************************************************/
5944
5945/** Worker for iemAImpl_fadd_r80_by_r80. */
5946static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5947 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5948{
5949 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5950 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5951 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5952}
5953
5954
5955IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5956 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5957{
5958 uint16_t const fFcw = pFpuState->FCW;
5959 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5960
5961 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5962 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5963 {
5964 if (fFcw & X86_FCW_IM)
5965 pFpuRes->r80Result = g_r80Indefinite;
5966 else
5967 {
5968 pFpuRes->r80Result = *pr80Val1;
5969 fFsw |= X86_FSW_ES | X86_FSW_B;
5970 }
5971 fFsw |= X86_FSW_IE;
5972 }
5973 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5974 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5975 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5976 {
5977 if (fFcw & X86_FCW_DM)
5978 {
5979 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5980 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5981 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5982 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5983 }
5984 else
5985 {
5986 pFpuRes->r80Result = *pr80Val1;
5987 fFsw |= X86_FSW_ES | X86_FSW_B;
5988 }
5989 fFsw |= X86_FSW_DE;
5990 }
5991 /* SoftFloat can handle the rest: */
5992 else
5993 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5994
5995 pFpuRes->FSW = fFsw;
5996}
5997
5998
5999EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6000EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6001EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6002EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6003
6004
6005/*********************************************************************************************************************************
6006* x87 FPU Subtraction *
6007*********************************************************************************************************************************/
6008
6009/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6010static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6011 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6012{
6013 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6014 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6015 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6016}
6017
6018
6019IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6020 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6021{
6022 uint16_t const fFcw = pFpuState->FCW;
6023 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6024
6025 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6026 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6027 {
6028 if (fFcw & X86_FCW_IM)
6029 pFpuRes->r80Result = g_r80Indefinite;
6030 else
6031 {
6032 pFpuRes->r80Result = *pr80Val1;
6033 fFsw |= X86_FSW_ES | X86_FSW_B;
6034 }
6035 fFsw |= X86_FSW_IE;
6036 }
6037 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6038 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6039 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6040 {
6041 if (fFcw & X86_FCW_DM)
6042 {
6043 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6044 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6045 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6046 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6047 }
6048 else
6049 {
6050 pFpuRes->r80Result = *pr80Val1;
6051 fFsw |= X86_FSW_ES | X86_FSW_B;
6052 }
6053 fFsw |= X86_FSW_DE;
6054 }
6055 /* SoftFloat can handle the rest: */
6056 else
6057 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6058
6059 pFpuRes->FSW = fFsw;
6060}
6061
6062
6063EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6064EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6065EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6066EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6067
6068
6069/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6070IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6071 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6072{
6073 uint16_t const fFcw = pFpuState->FCW;
6074 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6075
6076 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6077 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6078 {
6079 if (fFcw & X86_FCW_IM)
6080 pFpuRes->r80Result = g_r80Indefinite;
6081 else
6082 {
6083 pFpuRes->r80Result = *pr80Val1;
6084 fFsw |= X86_FSW_ES | X86_FSW_B;
6085 }
6086 fFsw |= X86_FSW_IE;
6087 }
6088 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6089 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6090 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6091 {
6092 if (fFcw & X86_FCW_DM)
6093 {
6094 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6095 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6096 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6097 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6098 }
6099 else
6100 {
6101 pFpuRes->r80Result = *pr80Val1;
6102 fFsw |= X86_FSW_ES | X86_FSW_B;
6103 }
6104 fFsw |= X86_FSW_DE;
6105 }
6106 /* SoftFloat can handle the rest: */
6107 else
6108 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6109
6110 pFpuRes->FSW = fFsw;
6111}
6112
6113
/* Instantiate the FSUBR flavours that take a 64-bit real, 32-bit real, 32-bit
   integer or 16-bit integer second operand.  Each EMIT_R80_BY_* invocation
   names the function to generate first and the 80-bit by 80-bit
   implementation it is built on second.
   NOTE(review): the EMIT_R80_BY_* macros are defined outside this excerpt;
   presumably they convert the second operand to 80-bit and then call the
   named implementation, and the trailing 0 is a macro specific argument -
   confirm against the macro definitions. */
EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6118
6119
6120/*********************************************************************************************************************************
6121* x87 FPU Trigonometric Operations *
6122*********************************************************************************************************************************/
6123
6124
6125IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6126 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6127{
6128 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6129 AssertReleaseFailed();
6130}
6131
6132#endif /* IEM_WITHOUT_ASSEMBLY */
6133
6134IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6135 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6136{
6137 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6138}
6139
6140IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6141 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6142{
6143 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6144}
6145
6146
6147#if defined(IEM_WITHOUT_ASSEMBLY)
6148IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6149{
6150 RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
6151 AssertReleaseFailed();
6152}
6153#endif /* IEM_WITHOUT_ASSEMBLY */
6154
6155IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6156{
6157 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6158}
6159
6160IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6161{
6162 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6163}
6164
6165
6166#ifdef IEM_WITHOUT_ASSEMBLY
6167
6168static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6169{
6170 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6171 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6172 extFloat80_t v;
6173 (void)fFcw;
6174
6175 v = extF80_sin(x, &SoftState);
6176
6177 iemFpuSoftF80ToIprt(pr80Result, v);
6178
6179 return fFsw;
6180}
6181
6182IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6183{
6184 uint16_t const fFcw = pFpuState->FCW;
6185 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6186
6187 if (RTFLOAT80U_IS_ZERO(pr80Val))
6188 {
6189 pFpuRes->r80Result = *pr80Val;
6190 }
6191 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6192 {
6193 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6194 {
6195 fFsw |= X86_FSW_C2;
6196 pFpuRes->r80Result = *pr80Val;
6197 }
6198 else
6199 {
6200 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6201 {
6202 pFpuRes->r80Result = *pr80Val;
6203
6204 }
6205 else
6206 {
6207 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6208 }
6209 fFsw |= X86_FSW_PE;
6210 if (!(fFcw & X86_FCW_PM))
6211 fFsw |= X86_FSW_ES | X86_FSW_B;
6212 }
6213 }
6214 else if (RTFLOAT80U_IS_INF(pr80Val))
6215 {
6216 fFsw |= X86_FSW_IE;
6217 if (!(fFcw & X86_FCW_IM))
6218 {
6219 fFsw |= X86_FSW_ES | X86_FSW_B;
6220 pFpuRes->r80Result = *pr80Val;
6221 }
6222 else
6223 {
6224 pFpuRes->r80Result = g_r80Indefinite;
6225 }
6226 }
6227 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6228 {
6229 pFpuRes->r80Result = *pr80Val;
6230 fFsw |= X86_FSW_DE;
6231
6232 if (fFcw & X86_FCW_DM)
6233 {
6234 fFsw |= X86_FSW_UE | X86_FSW_PE;
6235
6236 if (!(fFcw & X86_FCW_UM) || !(fFcw & X86_FCW_PM))
6237 {
6238 fFsw |= X86_FSW_ES | X86_FSW_B;
6239 }
6240 }
6241 else
6242 {
6243 fFsw |= X86_FSW_ES | X86_FSW_B;
6244 }
6245 }
6246 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6247 {
6248 pFpuRes->r80Result = *pr80Val;
6249 fFsw |= X86_FSW_DE;
6250
6251 if (fFcw & X86_FCW_DM)
6252 {
6253 if (fFcw & X86_FCW_PM)
6254 {
6255 fFsw |= X86_FSW_PE;
6256 }
6257 else
6258 {
6259 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6260 }
6261
6262 pFpuRes->r80Result.sj64.uExponent = 1;
6263 }
6264 else
6265 {
6266 fFsw |= X86_FSW_ES | X86_FSW_B;
6267 }
6268 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6269 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6270 {
6271 pFpuRes->r80Result = *pr80Val;
6272 } else {
6273 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6274 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6275 && (fFcw & X86_FCW_IM))
6276 pFpuRes->r80Result = g_r80Indefinite;
6277 else
6278 {
6279 pFpuRes->r80Result = *pr80Val;
6280 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6281 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6282 }
6283
6284 fFsw |= X86_FSW_IE;
6285 if (!(fFcw & X86_FCW_IM))
6286 fFsw |= X86_FSW_ES | X86_FSW_B;
6287 }
6288
6289 pFpuRes->FSW = fFsw;
6290}
6291#endif /* IEM_WITHOUT_ASSEMBLY */
6292
6293IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6294{
6295 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6296}
6297
6298IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6299{
6300 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6301}
6302
6303#ifdef IEM_WITHOUT_ASSEMBLY
6304
6305static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6306{
6307 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6308 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6309 extFloat80_t v;
6310 (void)fFcw;
6311
6312 v = extF80_cos(x, &SoftState);
6313
6314 iemFpuSoftF80ToIprt(pr80Result, v);
6315
6316 return fFsw;
6317}
6318
6319IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6320{
6321 uint16_t const fFcw = pFpuState->FCW;
6322 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6323
6324 if (RTFLOAT80U_IS_ZERO(pr80Val))
6325 {
6326 pFpuRes->r80Result = g_ar80One[0];
6327 }
6328 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6329 {
6330 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6331 {
6332 fFsw |= X86_FSW_C2;
6333 pFpuRes->r80Result = *pr80Val;
6334 }
6335 else
6336 {
6337 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6338 {
6339 pFpuRes->r80Result = g_ar80One[0];
6340
6341 }
6342 else
6343 {
6344 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6345 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6346 }
6347 fFsw |= X86_FSW_PE;
6348 if (!(fFcw & X86_FCW_PM))
6349 fFsw |= X86_FSW_ES | X86_FSW_B;
6350 }
6351 }
6352 else if (RTFLOAT80U_IS_INF(pr80Val))
6353 {
6354 fFsw |= X86_FSW_IE;
6355 if (!(fFcw & X86_FCW_IM))
6356 {
6357 fFsw |= X86_FSW_ES | X86_FSW_B;
6358 pFpuRes->r80Result = *pr80Val;
6359 }
6360 else
6361 {
6362 pFpuRes->r80Result = g_r80Indefinite;
6363 }
6364 }
6365 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6366 {
6367 fFsw |= X86_FSW_DE;
6368
6369 if (fFcw & X86_FCW_DM)
6370 {
6371 pFpuRes->r80Result = g_ar80One[0];
6372
6373 if (fFcw & X86_FCW_PM)
6374 {
6375 fFsw |= X86_FSW_PE;
6376 }
6377 else
6378 {
6379 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6380 }
6381 }
6382 else
6383 {
6384 pFpuRes->r80Result = *pr80Val;
6385 fFsw |= X86_FSW_ES | X86_FSW_B;
6386 }
6387 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6388 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6389 {
6390 pFpuRes->r80Result = *pr80Val;
6391 } else {
6392 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6393 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6394 && (fFcw & X86_FCW_IM))
6395 pFpuRes->r80Result = g_r80Indefinite;
6396 else
6397 {
6398 pFpuRes->r80Result = *pr80Val;
6399 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6400 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6401 }
6402
6403 fFsw |= X86_FSW_IE;
6404 if (!(fFcw & X86_FCW_IM))
6405 fFsw |= X86_FSW_ES | X86_FSW_B;
6406 }
6407
6408 pFpuRes->FSW = fFsw;
6409}
6410#endif /* IEM_WITHOUT_ASSEMBLY */
6411
6412IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6413{
6414 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6415}
6416
6417IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6418{
6419 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6420}
6421
6422#ifdef IEM_WITHOUT_ASSEMBLY
6423
6424static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6425{
6426 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6427 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6428 extFloat80_t r80Sin, r80Cos;
6429 (void)fFcw;
6430
6431 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6432
6433 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6434 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6435
6436 return fFsw;
6437}
6438
6439IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6440{
6441 uint16_t const fFcw = pFpuState->FCW;
6442 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6443
6444 if (RTFLOAT80U_IS_ZERO(pr80Val))
6445 {
6446 pFpuResTwo->r80Result1 = *pr80Val;
6447 pFpuResTwo->r80Result2 = g_ar80One[0];
6448 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6449 }
6450 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6451 {
6452 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6453 {
6454 fFsw |= X86_FSW_C2;
6455
6456 if (fFcw & X86_FCW_IM)
6457 {
6458 pFpuResTwo->r80Result1 = g_r80Indefinite;
6459 }
6460 else
6461 {
6462 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6463 }
6464
6465 pFpuResTwo->r80Result2 = *pr80Val;
6466 }
6467 else
6468 {
6469 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6470
6471 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6472 {
6473 pFpuResTwo->r80Result1 = *pr80Val;
6474 pFpuResTwo->r80Result2 = g_ar80One[0];
6475 }
6476 else
6477 {
6478 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6479 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6480 }
6481 fFsw |= X86_FSW_PE;
6482 if (!(fFcw & X86_FCW_PM))
6483 fFsw |= X86_FSW_ES | X86_FSW_B;
6484 }
6485 }
6486 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6487 {
6488 fFsw |= X86_FSW_DE;
6489
6490 if (fFcw & X86_FCW_DM)
6491 {
6492 pFpuResTwo->r80Result1 = *pr80Val;
6493 pFpuResTwo->r80Result2 = g_ar80One[0];
6494 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6495
6496 if (fFcw & X86_FCW_PM)
6497 {
6498 fFsw |= X86_FSW_PE;
6499 }
6500 else
6501 {
6502 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6503 }
6504
6505 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6506 }
6507 else
6508 {
6509 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6510 pFpuResTwo->r80Result2 = *pr80Val;
6511 fFsw |= X86_FSW_ES | X86_FSW_B;
6512 }
6513 }
6514 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6515 {
6516 fFsw |= X86_FSW_DE;
6517
6518 if (fFcw & X86_FCW_DM)
6519 {
6520 pFpuResTwo->r80Result1 = *pr80Val;
6521 pFpuResTwo->r80Result2 = g_ar80One[0];
6522
6523 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6524 fFsw |= X86_FSW_UE | X86_FSW_PE;
6525
6526 if (fFcw & X86_FCW_PM)
6527 {
6528 if (!(fFcw & X86_FCW_UM))
6529 fFsw |= X86_FSW_ES | X86_FSW_B;
6530 }
6531 else
6532 {
6533 fFsw |= X86_FSW_ES | X86_FSW_B;
6534 }
6535 }
6536 else
6537 {
6538 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6539 pFpuResTwo->r80Result2 = *pr80Val;
6540 fFsw |= X86_FSW_ES | X86_FSW_B;
6541 }
6542 }
6543 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6544 {
6545 pFpuResTwo->r80Result1 = *pr80Val;
6546 pFpuResTwo->r80Result2 = *pr80Val;
6547 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6548 }
6549 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6550 {
6551 if (fFcw & X86_FCW_IM)
6552 {
6553 pFpuResTwo->r80Result1 = g_r80Indefinite;
6554 pFpuResTwo->r80Result2 = g_r80Indefinite;
6555 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6556 }
6557 else
6558 {
6559 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6560 pFpuResTwo->r80Result2 = *pr80Val;
6561 }
6562
6563 fFsw |= X86_FSW_IE;
6564 if (!(fFcw & X86_FCW_IM))
6565 fFsw |= X86_FSW_ES | X86_FSW_B;
6566 }
6567 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6568 {
6569 pFpuResTwo->r80Result1 = *pr80Val;
6570 pFpuResTwo->r80Result2 = *pr80Val;
6571
6572 if (fFcw & X86_FCW_IM)
6573 {
6574 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6575 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6576 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6577 }
6578 else
6579 {
6580 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6581 pFpuResTwo->r80Result2 = *pr80Val;
6582 }
6583
6584 fFsw |= X86_FSW_IE;
6585 if (!(fFcw & X86_FCW_IM))
6586 fFsw |= X86_FSW_ES | X86_FSW_B;
6587 }
6588 else if (RTFLOAT80U_IS_INF(pr80Val))
6589 {
6590 if (fFcw & X86_FCW_IM)
6591 {
6592 pFpuResTwo->r80Result1 = g_r80Indefinite;
6593 pFpuResTwo->r80Result2 = g_r80Indefinite;
6594 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6595 }
6596 else
6597 {
6598 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6599 pFpuResTwo->r80Result2 = *pr80Val;
6600 }
6601
6602 fFsw |= X86_FSW_IE;
6603 if (!(fFcw & X86_FCW_IM))
6604 fFsw |= X86_FSW_ES | X86_FSW_B;
6605 }
6606
6607 pFpuResTwo->FSW = fFsw;
6608}
6609#endif /* IEM_WITHOUT_ASSEMBLY */
6610
6611IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6612{
6613 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6614}
6615
6616IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6617{
6618 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6619}
6620
6621#ifdef IEM_WITHOUT_ASSEMBLY
6622
6623
6624/*********************************************************************************************************************************
6625* x87 FPU Compare and Testing Operations *
6626*********************************************************************************************************************************/
6627
6628IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6629{
6630 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6631
6632 if (RTFLOAT80U_IS_ZERO(pr80Val))
6633 fFsw |= X86_FSW_C3;
6634 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6635 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6636 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6637 {
6638 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6639 if (!(pFpuState->FCW & X86_FCW_DM))
6640 fFsw |= X86_FSW_ES | X86_FSW_B;
6641 }
6642 else
6643 {
6644 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6645 if (!(pFpuState->FCW & X86_FCW_IM))
6646 fFsw |= X86_FSW_ES | X86_FSW_B;
6647 }
6648
6649 *pu16Fsw = fFsw;
6650}
6651
6652
6653IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6654{
6655 RT_NOREF(pFpuState);
6656 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6657
6658 /* C1 = sign bit (always, even if empty Intel says). */
6659 if (pr80Val->s.fSign)
6660 fFsw |= X86_FSW_C1;
6661
6662 /* Classify the value in C0, C2, C3. */
6663 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6664 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6665 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6666 fFsw |= X86_FSW_C2;
6667 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6668 fFsw |= X86_FSW_C3;
6669 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6670 fFsw |= X86_FSW_C0;
6671 else if (RTFLOAT80U_IS_INF(pr80Val))
6672 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6673 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6674 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6675 /* whatever else: 0 */
6676
6677 *pu16Fsw = fFsw;
6678}
6679
6680
6681/**
6682 * Worker for fcom, fucom, and friends.
6683 */
6684static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6685 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6686{
6687 /*
6688 * Unpack the values.
6689 */
6690 bool const fSign1 = pr80Val1->s.fSign;
6691 int32_t iExponent1 = pr80Val1->s.uExponent;
6692 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6693
6694 bool const fSign2 = pr80Val2->s.fSign;
6695 int32_t iExponent2 = pr80Val2->s.uExponent;
6696 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6697
6698 /*
6699 * Check for invalid inputs.
6700 */
6701 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6702 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6703 {
6704 if (!(fFcw & X86_FCW_IM))
6705 fFsw |= X86_FSW_ES | X86_FSW_B;
6706 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6707 }
6708
6709 /*
6710 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6711 */
6712 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6713 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6714 {
6715 if ( fIeOnAllNaNs
6716 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6717 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6718 {
6719 fFsw |= X86_FSW_IE;
6720 if (!(fFcw & X86_FCW_IM))
6721 fFsw |= X86_FSW_ES | X86_FSW_B;
6722 }
6723 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6724 }
6725
6726 /*
6727 * Normalize the values.
6728 */
6729 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6730 {
6731 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6732 iExponent1 = 1;
6733 else
6734 {
6735 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6736 uMantissa1 <<= iExponent1;
6737 iExponent1 = 1 - iExponent1;
6738 }
6739 fFsw |= X86_FSW_DE;
6740 if (!(fFcw & X86_FCW_DM))
6741 fFsw |= X86_FSW_ES | X86_FSW_B;
6742 }
6743
6744 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6745 {
6746 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6747 iExponent2 = 1;
6748 else
6749 {
6750 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6751 uMantissa2 <<= iExponent2;
6752 iExponent2 = 1 - iExponent2;
6753 }
6754 fFsw |= X86_FSW_DE;
6755 if (!(fFcw & X86_FCW_DM))
6756 fFsw |= X86_FSW_ES | X86_FSW_B;
6757 }
6758
6759 /*
6760 * Test if equal (val1 == val2):
6761 */
6762 if ( uMantissa1 == uMantissa2
6763 && iExponent1 == iExponent2
6764 && ( fSign1 == fSign2
6765 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6766 fFsw |= X86_FSW_C3;
6767 /*
6768 * Test if less than (val1 < val2):
6769 */
6770 else if (fSign1 && !fSign2)
6771 fFsw |= X86_FSW_C0;
6772 else if (fSign1 == fSign2)
6773 {
6774 /* Zeros are problematic, however at the most one can be zero here. */
6775 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6776 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6777 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6778 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6779
6780 if ( fSign1
6781 ^ ( iExponent1 < iExponent2
6782 || ( iExponent1 == iExponent2
6783 && uMantissa1 < uMantissa2 ) ) )
6784 fFsw |= X86_FSW_C0;
6785 }
6786 /* else: No flags set if greater. */
6787
6788 return fFsw;
6789}
6790
6791
6792IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6793 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6794{
6795 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6796}
6797
6798
6799
6800
6801IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6802 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6803{
6804 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6805}
6806
6807
6808IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6809 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6810{
6811 RTFLOAT80U r80Val2;
6812 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6813 Assert(!fFsw || fFsw == X86_FSW_DE);
6814 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6815 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6816 {
6817 if (!(pFpuState->FCW & X86_FCW_DM))
6818 fFsw |= X86_FSW_ES | X86_FSW_B;
6819 *pfFsw |= fFsw;
6820 }
6821}
6822
6823
6824IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6825 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6826{
6827 RTFLOAT80U r80Val2;
6828 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6829 Assert(!fFsw || fFsw == X86_FSW_DE);
6830 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6831 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6832 {
6833 if (!(pFpuState->FCW & X86_FCW_DM))
6834 fFsw |= X86_FSW_ES | X86_FSW_B;
6835 *pfFsw |= fFsw;
6836 }
6837}
6838
6839
6840IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6841 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6842{
6843 RTFLOAT80U r80Val2;
6844 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6845 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6846}
6847
6848
6849IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6850 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6851{
6852 RTFLOAT80U r80Val2;
6853 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6854 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6855}
6856
6857
6858/**
6859 * Worker for fcomi & fucomi.
6860 */
6861static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6862 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6863{
6864 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6865 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6866 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6867 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6868
6869 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6870 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6871 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6872}
6873
6874
6875IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6876 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6877{
6878 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
6879}
6880
6881
6882IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6883 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6884{
6885 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
6886}
6887
6888
6889/*********************************************************************************************************************************
6890* x87 FPU Other Operations *
6891*********************************************************************************************************************************/
6892
6893/**
6894 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
6895 */
6896static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6897{
6898 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6899 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
6900 true /*exact / generate #PE */, &SoftState));
6901 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6902}
6903
6904
6905IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6906{
6907 uint16_t const fFcw = pFpuState->FCW;
6908 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6909
6910 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6911 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6912 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6913 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6914 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6915 || RTFLOAT80U_IS_INF(pr80Val))
6916 pFpuRes->r80Result = *pr80Val;
6917 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6918 {
6919 fFsw |= X86_FSW_DE;
6920 if (fFcw & X86_FCW_DM)
6921 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6922 else
6923 {
6924 pFpuRes->r80Result = *pr80Val;
6925 fFsw |= X86_FSW_ES | X86_FSW_B;
6926 }
6927 }
6928 else
6929 {
6930 if (fFcw & X86_FCW_IM)
6931 {
6932 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6933 pFpuRes->r80Result = g_r80Indefinite;
6934 else
6935 {
6936 pFpuRes->r80Result = *pr80Val;
6937 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6938 }
6939 }
6940 else
6941 {
6942 pFpuRes->r80Result = *pr80Val;
6943 fFsw |= X86_FSW_ES | X86_FSW_B;
6944 }
6945 fFsw |= X86_FSW_IE;
6946 }
6947 pFpuRes->FSW = fFsw;
6948}
6949
6950
6951IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6952 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6953{
6954 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
6955 it does everything we need it to do. */
6956 uint16_t const fFcw = pFpuState->FCW;
6957 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6958 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6959 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6960 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6961}
6962
6963
6964/**
6965 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
6966 */
6967static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6968{
6969 Assert(!pr80Val->s.fSign);
6970 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6971 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
6972 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6973}
6974
6975
6976IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6977{
6978 uint16_t const fFcw = pFpuState->FCW;
6979 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6980
6981 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
6982 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6983 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6984 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6985 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6986 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
6987 pFpuRes->r80Result = *pr80Val;
6988 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
6989 {
6990 fFsw |= X86_FSW_DE;
6991 if (fFcw & X86_FCW_DM)
6992 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6993 else
6994 {
6995 pFpuRes->r80Result = *pr80Val;
6996 fFsw |= X86_FSW_ES | X86_FSW_B;
6997 }
6998 }
6999 else
7000 {
7001 if (fFcw & X86_FCW_IM)
7002 {
7003 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7004 pFpuRes->r80Result = g_r80Indefinite;
7005 else
7006 {
7007 pFpuRes->r80Result = *pr80Val;
7008 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7009 }
7010 }
7011 else
7012 {
7013 pFpuRes->r80Result = *pr80Val;
7014 fFsw |= X86_FSW_ES | X86_FSW_B;
7015 }
7016 fFsw |= X86_FSW_IE;
7017 }
7018 pFpuRes->FSW = fFsw;
7019}
7020
7021
7022/**
7023 * @code{.unparsed}
7024 *            x        x * ln2
7025 *    f(x) = 2  - 1 = e        - 1
7026 *
7027 * @endcode
7028 *
7029 * We can approximate e^x by a Taylor/Maclaurin series (see
7030 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7031 * @code{.unparsed}
7032 *             n         0     1     2     3     4
7033 *    inf     x         x     x     x     x     x
7034 *    SUM   -----   =  --- + --- + --- + --- + --- + ...
7035 *    n=0     n!        0!    1!    2!    3!    4!
7036 *
7037 *                                  2     3     4
7038 *                                 x     x     x
7039 *                  =  1  +  x  + --- + --- + --- + ...
7040 *                                 2!    3!    4!
7041 * @endcode
7042 *
7043 * Given z = x * ln2, we get:
7044 * @code{.unparsed}
7045 *                       2     3     4           n
7046 *     z                z     z     z           z
7047 *    e  - 1  =  z  +  --- + --- + --- + ... + ---
7048 *                      2!    3!    4!          n!
7049 * @endcode
7050 *
7051 * Wanting to use Horner's method, we move one z outside and get:
7052 * @code{.unparsed}
7053 *                             2     3           (n-1)
7054 *                      z     z     z           z
7055 *    =  z  (  1  +    --- + --- + --- + ... + -------  )
7056 *                      2!    3!    4!           n!
7057 * @endcode
7058 *
7059 * The constants we need for using Horner's methods are 1 and 1 / n!.
7060 *
7061 * For very tiny x values, we can get away with f(x) = x * ln 2,
7062 * because we don't have the necessary precision to represent 1.0 + z/2 + ...
7063 * and can approximate it to be 1.0. For a visual demonstration of this
7064 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
7065 * as it is valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7066 *
7067 *
7068 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7069 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7070 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7071 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7072 * blocks). (The one bit difference is probably an implicit one missing from
7073 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7074 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7075 * exponent.
7076 *
7077 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have
7078 * not yet successfully reproduced the exact results from an Intel 10980XE;
7079 * there is always a portion of rounding differences. Not going to spend too
7080 * much time on getting this 100% the same, at least not now.
7081 *
7082 * P.S. If someone is really curious about the 8087 and its constants:
7083 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7084 *
7085 *
7086 * @param pr80Val The exponent value (x), less than 1.0, greater than
7087 * -1.0 and not zero. This can be a normal, denormal
7088 * or pseudo-denormal value.
7089 * @param pr80Result Where to return the result.
7090 * @param fFcw FPU control word.
7091 * @param fFsw FPU status word.
7092 */
7093static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7094{
7095 /* As mentioned above, we can skip the expensive polynomial calculation
7096 as it will be close enough to 1.0 that it makes no difference.
7097
7098 The cutoff point for intel 10980XE is exponents >= -69. Intel
7099 also seems to be using a 67-bit or 68-bit constant value, and we get
7100 a smattering of rounding differences if we go for higher precision. */
7101 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7102 {
7103 RTUINT256U u256;
7104 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7105 u256.QWords.qw0 |= 1; /* force #PE */
7106 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7107 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7108 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7109 : 1 - RTFLOAT80U_EXP_BIAS,
7110 fFcw, fFsw);
7111 }
7112 else
7113 {
7114#ifdef IEM_WITH_FLOAT128_FOR_FPU
7115 /* This approach is not good enough for small values, we end up with zero. */
7116 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7117 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7118 _Float128 rd128Result = powf128(2.0L, rd128Val);
7119 rd128Result -= 1.0L;
7120 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7121 iemFpuF128RestoreRounding(fOldRounding);
7122
7123# else
7124 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7125 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7126
7127 /* As mentioned above, enforce 68-bit internal mantissa width to better
7128 match the Intel 10980XE results. */
7129 unsigned const cPrecision = 68;
7130
7131 /* first calculate z = x * ln2 */
7132 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7133 cPrecision);
7134
7135 /* Then do the polynomial evaluation. */
7136 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7137 cPrecision, &SoftState);
7138 r = f128_mul(z, r, &SoftState);
7139
7140 /* Output the result. */
7141 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7142# endif
7143 }
7144 return fFsw;
7145}
7146
7147
7148IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7149{
7150 uint16_t const fFcw = pFpuState->FCW;
7151 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7152
7153 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7154 {
7155 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7156 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7157 else
7158 {
7159 /* Special case:
7160 2^+1.0 - 1.0 = 1.0
7161 2^-1.0 - 1.0 = -0.5 */
7162 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7163 && pr80Val->s.uMantissa == RT_BIT_64(63))
7164 {
7165 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7166 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7167 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7168 }
7169 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7170 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7171 else
7172 pFpuRes->r80Result = *pr80Val;
7173 fFsw |= X86_FSW_PE;
7174 if (!(fFcw & X86_FCW_PM))
7175 fFsw |= X86_FSW_ES | X86_FSW_B;
7176 }
7177 }
7178 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7179 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7180 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7181 pFpuRes->r80Result = *pr80Val;
7182 else if (RTFLOAT80U_IS_INF(pr80Val))
7183 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7184 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7185 {
7186 fFsw |= X86_FSW_DE;
7187 if (fFcw & X86_FCW_DM)
7188 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7189 else
7190 {
7191 pFpuRes->r80Result = *pr80Val;
7192 fFsw |= X86_FSW_ES | X86_FSW_B;
7193 }
7194 }
7195 else
7196 {
7197 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7198 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7199 && (fFcw & X86_FCW_IM))
7200 pFpuRes->r80Result = g_r80Indefinite;
7201 else
7202 {
7203 pFpuRes->r80Result = *pr80Val;
7204 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7205 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7206 }
7207 fFsw |= X86_FSW_IE;
7208 if (!(fFcw & X86_FCW_IM))
7209 fFsw |= X86_FSW_ES | X86_FSW_B;
7210 }
7211 pFpuRes->FSW = fFsw;
7212}
7213
7214#endif /* IEM_WITHOUT_ASSEMBLY */
7215
7216IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7217{
7218 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7219}
7220
7221IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7222{
7223 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7224}
7225
7226#ifdef IEM_WITHOUT_ASSEMBLY
7227
7228IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7229{
7230 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7231 pFpuRes->r80Result = *pr80Val;
7232 pFpuRes->r80Result.s.fSign = 0;
7233}
7234
7235
7236IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7237{
7238 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7239 pFpuRes->r80Result = *pr80Val;
7240 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7241}
7242
7243
7244IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7245{
7246 uint16_t const fFcw = pFpuState->FCW;
7247 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7248
7249 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7250 {
7251 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7252 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7253
7254 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7255 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7256 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7257 }
7258 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7259 {
7260 fFsw |= X86_FSW_ZE;
7261 if (fFcw & X86_FCW_ZM)
7262 {
7263 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7264 pFpuResTwo->r80Result2 = *pr80Val;
7265 }
7266 else
7267 {
7268 pFpuResTwo->r80Result2 = *pr80Val;
7269 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7270 }
7271 }
7272 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7273 {
7274 fFsw |= X86_FSW_DE;
7275 if (fFcw & X86_FCW_DM)
7276 {
7277 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7278 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7279 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7280 int32_t iExponent = -16382;
7281 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7282 {
7283 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7284 iExponent--;
7285 }
7286
7287 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7288 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7289 }
7290 else
7291 {
7292 pFpuResTwo->r80Result2 = *pr80Val;
7293 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7294 }
7295 }
7296 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7297 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7298 {
7299 pFpuResTwo->r80Result1 = *pr80Val;
7300 pFpuResTwo->r80Result2 = *pr80Val;
7301 }
7302 else if (RTFLOAT80U_IS_INF(pr80Val))
7303 {
7304 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7305 pFpuResTwo->r80Result2 = *pr80Val;
7306 }
7307 else
7308 {
7309 if (fFcw & X86_FCW_IM)
7310 {
7311 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7312 pFpuResTwo->r80Result1 = g_r80Indefinite;
7313 else
7314 {
7315 pFpuResTwo->r80Result1 = *pr80Val;
7316 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7317 }
7318 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7319 }
7320 else
7321 {
7322 pFpuResTwo->r80Result2 = *pr80Val;
7323 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7324 }
7325 fFsw |= X86_FSW_IE;
7326 }
7327 pFpuResTwo->FSW = fFsw;
7328}
7329#endif /* IEM_WITHOUT_ASSEMBLY */
7330
7331#if defined(IEM_WITHOUT_ASSEMBLY)
7332
7333static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7334{
7335 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7336 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7337 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7338 extFloat80_t v;
7339 (void)fFcw;
7340
7341 v = extF80_ylog2x(y, x, &SoftState);
7342 iemFpuSoftF80ToIprt(pr80Result, v);
7343
7344 return fFsw;
7345}
7346
7347IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7348 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7349{
7350 uint16_t const fFcw = pFpuState->FCW;
7351 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7352
7353 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7354 {
7355 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7356
7357 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7358 if (!(fFcw & X86_FCW_PM))
7359 fFsw |= X86_FSW_ES | X86_FSW_B;
7360 }
7361 else
7362 {
7363 fFsw |= X86_FSW_IE;
7364
7365 if (!(fFcw & X86_FCW_IM))
7366 {
7367 pFpuRes->r80Result = *pr80Val2;
7368 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7369 }
7370 else
7371 {
7372 pFpuRes->r80Result = g_r80Indefinite;
7373 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7374 }
7375 }
7376
7377 pFpuRes->FSW = fFsw;
7378}
7379#endif /* IEM_WITHOUT_ASSEMBLY */
7380
7381IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7382 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7383{
7384 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7385}
7386
7387IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7388 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7389{
7390 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7391}
7392
7393#if defined(IEM_WITHOUT_ASSEMBLY)
7394
7395static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7396{
7397 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7398 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7399 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7400 extFloat80_t v;
7401 (void)fFcw;
7402
7403 v = extF80_ylog2xp1(y, x, &SoftState);
7404 iemFpuSoftF80ToIprt(pr80Result, v);
7405
7406 return fFsw;
7407}
7408
7409IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7410 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7411{
7412 uint16_t const fFcw = pFpuState->FCW;
7413 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7414
7415 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7416 {
7417 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7418
7419 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7420 if (!(fFcw & X86_FCW_PM))
7421 fFsw |= X86_FSW_ES | X86_FSW_B;
7422 }
7423 else
7424 {
7425 fFsw |= X86_FSW_IE;
7426
7427 if (!(fFcw & X86_FCW_IM))
7428 {
7429 pFpuRes->r80Result = *pr80Val2;
7430 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7431 }
7432 else
7433 {
7434 pFpuRes->r80Result = g_r80Indefinite;
7435 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7436 }
7437 }
7438
7439 pFpuRes->FSW = fFsw;
7440}
7441
7442#endif /* IEM_WITHOUT_ASSEMBLY */
7443
7444IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7445 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7446{
7447 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7448}
7449
7450IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7451 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7452{
7453 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7454}
7455
7456
7457/*********************************************************************************************************************************
7458* MMX, SSE & AVX *
7459*********************************************************************************************************************************/
7460
7461/*
7462 * MOVSLDUP / VMOVSLDUP
7463 */
7464IEM_DECL_IMPL_DEF(void, iemAImpl_movsldup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7465{
7466 puDst->au32[0] = puSrc->au32[0];
7467 puDst->au32[1] = puSrc->au32[0];
7468 puDst->au32[2] = puSrc->au32[2];
7469 puDst->au32[3] = puSrc->au32[2];
7470}
7471
7472#ifdef IEM_WITH_VEX
7473
7474IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7475{
7476 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7477 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7478 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7479 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7480 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7481 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7482 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7483 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7484}
7485
7486
7487IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7488{
7489 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7490 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7491 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7492 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7493 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7494 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7495 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7496 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7497}
7498
7499#endif /* IEM_WITH_VEX */
7500
7501
7502/*
7503 * MOVSHDUP / VMOVSHDUP
7504 */
7505IEM_DECL_IMPL_DEF(void, iemAImpl_movshdup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7506{
7507 puDst->au32[0] = puSrc->au32[1];
7508 puDst->au32[1] = puSrc->au32[1];
7509 puDst->au32[2] = puSrc->au32[3];
7510 puDst->au32[3] = puSrc->au32[3];
7511}
7512
7513#ifdef IEM_WITH_VEX
7514
7515IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7516{
7517 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7518 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7519 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7520 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7521 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7522 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7523 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7524 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7525}
7526
7527
7528IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7529{
7530 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7531 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7532 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7533 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7534 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7535 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7536 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7537 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7538}
7539
7540#endif /* IEM_WITH_VEX */
7541
7542
7543/*
7544 * MOVDDUP / VMOVDDUP
7545 */
7546IEM_DECL_IMPL_DEF(void, iemAImpl_movddup,(PRTUINT128U puDst, uint64_t uSrc))
7547{
7548 puDst->au64[0] = uSrc;
7549 puDst->au64[1] = uSrc;
7550}
7551
7552#ifdef IEM_WITH_VEX
7553
7554IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7555{
7556 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7557 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7558 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7559 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7560}
7561
7562IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7563{
7564 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7565 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7566 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7567 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7568}
7569
7570#endif /* IEM_WITH_VEX */
7571
7572
7573/*
7574 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7575 */
7576#ifdef IEM_WITHOUT_ASSEMBLY
7577
7578IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7579{
7580 RT_NOREF(pFpuState);
7581 *puDst &= *puSrc;
7582}
7583
7584
7585IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7586{
7587 RT_NOREF(pFpuState);
7588 puDst->au64[0] &= puSrc->au64[0];
7589 puDst->au64[1] &= puSrc->au64[1];
7590}
7591
7592#endif
7593
7594IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7595 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7596{
7597 RT_NOREF(pExtState);
7598 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7599 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7600}
7601
7602
7603IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7604 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7605{
7606 RT_NOREF(pExtState);
7607 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7608 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7609 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7610 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7611}
7612
7613
7614/*
7615 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7616 */
7617#ifdef IEM_WITHOUT_ASSEMBLY
7618
7619IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7620{
7621 RT_NOREF(pFpuState);
7622 *puDst = ~*puDst & *puSrc;
7623}
7624
7625
7626IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7627{
7628 RT_NOREF(pFpuState);
7629 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7630 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7631}
7632
7633#endif
7634
7635IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7636 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7637{
7638 RT_NOREF(pExtState);
7639 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7640 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7641}
7642
7643
7644IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7645 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7646{
7647 RT_NOREF(pExtState);
7648 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7649 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7650 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7651 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7652}
7653
7654
7655/*
7656 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7657 */
7658#ifdef IEM_WITHOUT_ASSEMBLY
7659
7660IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7661{
7662 RT_NOREF(pFpuState);
7663 *puDst |= *puSrc;
7664}
7665
7666
7667IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7668{
7669 RT_NOREF(pFpuState);
7670 puDst->au64[0] |= puSrc->au64[0];
7671 puDst->au64[1] |= puSrc->au64[1];
7672}
7673
7674#endif
7675
7676IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7677 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7678{
7679 RT_NOREF(pExtState);
7680 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7681 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7682}
7683
7684
7685IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7686 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7687{
7688 RT_NOREF(pExtState);
7689 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7690 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7691 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7692 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7693}
7694
7695
7696/*
7697 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7698 */
7699#ifdef IEM_WITHOUT_ASSEMBLY
7700
7701IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7702{
7703 RT_NOREF(pFpuState);
7704 *puDst ^= *puSrc;
7705}
7706
7707
7708IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7709{
7710 RT_NOREF(pFpuState);
7711 puDst->au64[0] ^= puSrc->au64[0];
7712 puDst->au64[1] ^= puSrc->au64[1];
7713}
7714
7715#endif
7716
7717IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7718 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7719{
7720 RT_NOREF(pExtState);
7721 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7722 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7723}
7724
7725
7726IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7727 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7728{
7729 RT_NOREF(pExtState);
7730 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7731 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7732 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7733 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7734}
7735
7736
7737/*
7738 * PCMPEQB / VPCMPEQB
7739 */
7740#ifdef IEM_WITHOUT_ASSEMBLY
7741
7742IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7743{
7744 RT_NOREF(pFpuState);
7745 RTUINT64U uSrc1 = { *puDst };
7746 RTUINT64U uSrc2 = { *puSrc };
7747 RTUINT64U uDst;
7748 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7749 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7750 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7751 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7752 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7753 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7754 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7755 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7756 *puDst = uDst.u;
7757}
7758
7759
7760IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7761{
7762 RT_NOREF(pFpuState);
7763 RTUINT128U uSrc1 = *puDst;
7764 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7765 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7766 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7767 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7768 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7769 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7770 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7771 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7772 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7773 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7774 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7775 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7776 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7777 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7778 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7779 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7780}
7781
7782#endif
7783
7784IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7785 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7786{
7787 RT_NOREF(pExtState);
7788 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7789 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7790 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7791 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7792 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7793 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7794 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7795 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7796 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7797 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7798 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7799 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7800 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7801 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7802 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7803 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7804}
7805
7806IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7807 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7808{
7809 RT_NOREF(pExtState);
7810 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7811 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7812 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7813 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7814 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7815 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7816 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7817 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7818 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7819 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7820 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7821 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7822 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7823 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7824 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7825 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7826 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7827 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7828 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7829 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7830 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7831 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7832 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7833 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7834 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7835 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7836 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7837 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7838 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7839 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7840 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7841 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7842}
7843
7844
7845/*
7846 * PCMPEQW / VPCMPEQW
7847 */
7848#ifdef IEM_WITHOUT_ASSEMBLY
7849
7850IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7851{
7852 RT_NOREF(pFpuState);
7853 RTUINT64U uSrc1 = { *puDst };
7854 RTUINT64U uSrc2 = { *puSrc };
7855 RTUINT64U uDst;
7856 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7857 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7858 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7859 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7860 *puDst = uDst.u;
7861}
7862
7863
7864IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7865{
7866 RT_NOREF(pFpuState);
7867 RTUINT128U uSrc1 = *puDst;
7868 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7869 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7870 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7871 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7872 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7873 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7874 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7875 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7876}
7877
7878#endif
7879
7880IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7881 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7882{
7883 RT_NOREF(pExtState);
7884 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7885 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7886 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7887 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7888 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7889 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7890 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7891 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7892}
7893
7894IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7895 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7896{
7897 RT_NOREF(pExtState);
7898 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7899 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7900 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7901 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7902 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7903 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7904 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7905 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7906 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
7907 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
7908 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
7909 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
7910 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
7911 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
7912 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
7913 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
7914}
7915
7916
7917/*
7918 * PCMPEQD / VPCMPEQD.
7919 */
7920#ifdef IEM_WITHOUT_ASSEMBLY
7921
7922IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7923{
7924 RT_NOREF(pFpuState);
7925 RTUINT64U uSrc1 = { *puDst };
7926 RTUINT64U uSrc2 = { *puSrc };
7927 RTUINT64U uDst;
7928 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
7929 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
7930 *puDst = uDst.u;
7931}
7932
7933
7934IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7935{
7936 RT_NOREF(pFpuState);
7937 RTUINT128U uSrc1 = *puDst;
7938 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
7939 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
7940 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
7941 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
7942}
7943
7944#endif /* IEM_WITHOUT_ASSEMBLY */
7945
7946IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7947 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7948{
7949 RT_NOREF(pExtState);
7950 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7951 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7952 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7953 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7954}
7955
7956IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7957 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7958{
7959 RT_NOREF(pExtState);
7960 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7961 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7962 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7963 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7964 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
7965 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
7966 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
7967 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
7968}
7969
7970
7971/*
7972 * PCMPEQQ / VPCMPEQQ.
7973 */
7974IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7975{
7976 RT_NOREF(pFpuState);
7977 RTUINT128U uSrc1 = *puDst;
7978 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
7979 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
7980}
7981
7982IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7983 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7984{
7985 RT_NOREF(pExtState);
7986 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7987 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7988}
7989
7990IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7991 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7992{
7993 RT_NOREF(pExtState);
7994 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7995 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7996 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
7997 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
7998}
7999
8000
8001/*
8002 * PCMPGTB / VPCMPGTB
8003 */
8004#ifdef IEM_WITHOUT_ASSEMBLY
8005
8006IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8007{
8008 RT_NOREF(pFpuState);
8009 RTUINT64U uSrc1 = { *puDst };
8010 RTUINT64U uSrc2 = { *puSrc };
8011 RTUINT64U uDst;
8012 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8013 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8014 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8015 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8016 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8017 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8018 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8019 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8020 *puDst = uDst.u;
8021}
8022
8023
8024IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8025{
8026 RT_NOREF(pFpuState);
8027 RTUINT128U uSrc1 = *puDst;
8028 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8029 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8030 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8031 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8032 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8033 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8034 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8035 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8036 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8037 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8038 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8039 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8040 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8041 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8042 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8043 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8044}
8045
8046#endif
8047
8048IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8049 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8050{
8051 RT_NOREF(pExtState);
8052 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8053 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8054 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8055 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8056 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8057 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8058 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8059 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8060 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8061 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8062 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8063 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8064 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8065 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8066 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8067 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8068}
8069
8070IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8071 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8072{
8073 RT_NOREF(pExtState);
8074 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8075 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8076 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8077 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8078 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8079 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8080 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8081 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8082 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8083 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8084 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8085 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8086 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8087 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8088 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8089 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8090 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8091 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8092 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8093 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8094 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8095 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8096 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8097 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8098 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8099 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8100 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8101 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8102 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8103 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8104 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8105 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8106}
8107
8108
8109/*
8110 * PCMPGTW / VPCMPGTW
8111 */
8112#ifdef IEM_WITHOUT_ASSEMBLY
8113
8114IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8115{
8116 RT_NOREF(pFpuState);
8117 RTUINT64U uSrc1 = { *puDst };
8118 RTUINT64U uSrc2 = { *puSrc };
8119 RTUINT64U uDst;
8120 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8121 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8122 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8123 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8124 *puDst = uDst.u;
8125}
8126
8127
8128IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8129{
8130 RT_NOREF(pFpuState);
8131 RTUINT128U uSrc1 = *puDst;
8132 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8133 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8134 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8135 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8136 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8137 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8138 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8139 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8140}
8141
8142#endif
8143
8144IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8145 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8146{
8147 RT_NOREF(pExtState);
8148 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8149 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8150 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8151 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8152 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8153 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8154 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8155 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8156}
8157
8158IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8159 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8160{
8161 RT_NOREF(pExtState);
8162 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8163 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8164 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8165 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8166 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8167 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8168 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8169 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8170 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8171 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8172 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8173 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8174 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8175 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8176 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8177 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8178}
8179
8180
8181/*
8182 * PCMPGTD / VPCMPGTD.
8183 */
8184#ifdef IEM_WITHOUT_ASSEMBLY
8185
8186IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8187{
8188 RT_NOREF(pFpuState);
8189 RTUINT64U uSrc1 = { *puDst };
8190 RTUINT64U uSrc2 = { *puSrc };
8191 RTUINT64U uDst;
8192 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8193 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8194 *puDst = uDst.u;
8195}
8196
8197
8198IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8199{
8200 RT_NOREF(pFpuState);
8201 RTUINT128U uSrc1 = *puDst;
8202 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8203 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8204 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8205 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8206}
8207
8208#endif /* IEM_WITHOUT_ASSEMBLY */
8209
8210IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8211 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8212{
8213 RT_NOREF(pExtState);
8214 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8215 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8216 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8217 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8218}
8219
8220IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8221 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8222{
8223 RT_NOREF(pExtState);
8224 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8225 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8226 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8227 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8228 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8229 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8230 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8231 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8232}
8233
8234
8235/*
8236 * PCMPGTQ / VPCMPGTQ.
8237 */
8238IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8239{
8240 RT_NOREF(pFpuState);
8241 RTUINT128U uSrc1 = *puDst;
8242 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8243 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8244}
8245
8246IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8247 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8248{
8249 RT_NOREF(pExtState);
8250 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8251 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8252}
8253
8254IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8255 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8256{
8257 RT_NOREF(pExtState);
8258 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8259 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8260 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8261 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8262}
8263
8264
8265/*
8266 * PADDB / VPADDB
8267 */
8268#ifdef IEM_WITHOUT_ASSEMBLY
8269
8270IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8271{
8272 RT_NOREF(pFpuState);
8273 RTUINT64U uSrc1 = { *puDst };
8274 RTUINT64U uSrc2 = { *puSrc };
8275 RTUINT64U uDst;
8276 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8277 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8278 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8279 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8280 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8281 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8282 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8283 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8284 *puDst = uDst.u;
8285}
8286
8287
8288IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8289{
8290 RT_NOREF(pFpuState);
8291 RTUINT128U uSrc1 = *puDst;
8292 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8293 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8294 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8295 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8296 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8297 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8298 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8299 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8300 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8301 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8302 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8303 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8304 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8305 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8306 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8307 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8308}
8309
8310#endif
8311
8312
8313IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8314 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8315{
8316 RT_NOREF(pExtState);
8317 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8318 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8319 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8320 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8321 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8322 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8323 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8324 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8325 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8326 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8327 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8328 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8329 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8330 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8331 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8332 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8333}
8334
8335IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8336 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8337{
8338 RT_NOREF(pExtState);
8339 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8340 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8341 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8342 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8343 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8344 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8345 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8346 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8347 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8348 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8349 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8350 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8351 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8352 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8353 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8354 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8355 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8356 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8357 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8358 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8359 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8360 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8361 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8362 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8363 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8364 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8365 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8366 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8367 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8368 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8369 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8370 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8371}
8372
8373
8374/*
8375 * PADDSB / VPADDSB
8376 */
/**
 * Clamps a signed, word-sized intermediate result into the signed byte range.
 *
 * Inputs from -128 thru 127 are returned as their (two's complement) byte
 * value.  Anything outside that range saturates to 0x7f (INT8_MAX) plus the
 * sign taken from bit 15 of the input, i.e. 0x7f on positive overflow and
 * 0x80 (INT8_MIN) on negative overflow.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) \
      : (uint8_t)(a_iWord) )
8381
8382#ifdef IEM_WITHOUT_ASSEMBLY
8383
8384IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8385{
8386 RT_NOREF(pFpuState);
8387 RTUINT64U uSrc1 = { *puDst };
8388 RTUINT64U uSrc2 = { *puSrc };
8389 RTUINT64U uDst;
8390 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8391 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8392 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8393 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8394 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8395 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8396 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8397 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8398 *puDst = uDst.u;
8399}
8400
8401
8402IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8403{
8404 RT_NOREF(pFpuState);
8405 RTUINT128U uSrc1 = *puDst;
8406 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8407 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8408 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8409 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8410 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8411 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8412 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8413 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8414 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8415 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8416 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8417 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8418 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8419 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8420 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8421 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8422}
8423
8424#endif
8425
8426
8427/*
8428 * PADDUSB / VPADDUSB
8429 */
/**
 * Clamps an unsigned, word-sized intermediate result into the unsigned byte
 * range.
 *
 * The input is looked at as a 16-bit unsigned value: anything up to and
 * including 0xff is returned unchanged, larger values yield 0xff (UINT8_MAX).
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
8434
8435#ifdef IEM_WITHOUT_ASSEMBLY
8436
8437IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8438{
8439 RT_NOREF(pFpuState);
8440 RTUINT64U uSrc1 = { *puDst };
8441 RTUINT64U uSrc2 = { *puSrc };
8442 RTUINT64U uDst;
8443 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8444 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8445 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8446 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8447 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8448 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8449 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8450 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8451 *puDst = uDst.u;
8452}
8453
8454
8455IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8456{
8457 RT_NOREF(pFpuState);
8458 RTUINT128U uSrc1 = *puDst;
8459 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8460 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8461 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8462 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8463 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8464 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8465 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8466 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8467 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8468 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8469 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8470 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8471 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8472 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8473 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8474 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8475}
8476
8477#endif
8478
8479
8480/*
8481 * PADDW / VPADDW
8482 */
8483#ifdef IEM_WITHOUT_ASSEMBLY
8484
8485IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8486{
8487 RT_NOREF(pFpuState);
8488 RTUINT64U uSrc1 = { *puDst };
8489 RTUINT64U uSrc2 = { *puSrc };
8490 RTUINT64U uDst;
8491 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8492 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8493 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8494 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8495 *puDst = uDst.u;
8496}
8497
8498
8499IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8500{
8501 RT_NOREF(pFpuState);
8502 RTUINT128U uSrc1 = *puDst;
8503 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8504 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8505 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8506 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8507 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8508 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8509 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8510 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8511}
8512
8513#endif
8514
8515
8516IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8517 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8518{
8519 RT_NOREF(pExtState);
8520 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8521 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8522 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8523 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8524 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8525 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8526 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8527 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8528}
8529
8530IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8531 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8532{
8533 RT_NOREF(pExtState);
8534 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8535 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8536 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8537 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8538 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8539 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8540 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8541 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8542 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8543 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8544 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8545 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8546 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8547 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8548 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8549 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8550}
8551
8552
8553/*
8554 * PADDSW / VPADDSW
8555 */
/**
 * Clamps a signed, dword-sized intermediate result into the signed word range.
 *
 * Inputs from -32768 thru 32767 are returned as their (two's complement) word
 * value.  Anything outside that range saturates to 0x7fff (INT16_MAX) plus the
 * sign taken from bit 31 of the input, i.e. 0x7fff on positive overflow and
 * 0x8000 (INT16_MIN) on negative overflow.
 *
 * @note    The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) \
      : (uint16_t)(a_iDword) )
8560
8561#ifdef IEM_WITHOUT_ASSEMBLY
8562
8563IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8564{
8565 RT_NOREF(pFpuState);
8566 RTUINT64U uSrc1 = { *puDst };
8567 RTUINT64U uSrc2 = { *puSrc };
8568 RTUINT64U uDst;
8569 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8570 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8571 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8572 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8573 *puDst = uDst.u;
8574}
8575
8576
8577IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8578{
8579 RT_NOREF(pFpuState);
8580 RTUINT128U uSrc1 = *puDst;
8581 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8582 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8583 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8584 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8585 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8586 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8587 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8588 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8589}
8590
8591#endif
8592
8593
8594/*
8595 * PADDUSW / VPADDUSW
8596 */
/**
 * Clamps an unsigned sum to 16 bits: values above 0xffff (UINT16_MAX) yield
 * 0xffff, everything else is returned truncated to 16 bits.
 *
 * @note    The argument is evaluated more than once on the in-range path.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    (   (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
8601
8602#ifdef IEM_WITHOUT_ASSEMBLY
8603
8604IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8605{
8606 RT_NOREF(pFpuState);
8607 RTUINT64U uSrc1 = { *puDst };
8608 RTUINT64U uSrc2 = { *puSrc };
8609 RTUINT64U uDst;
8610 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8611 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8612 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8613 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8614 *puDst = uDst.u;
8615}
8616
8617
8618IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8619{
8620 RT_NOREF(pFpuState);
8621 RTUINT128U uSrc1 = *puDst;
8622 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8623 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8624 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8625 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8626 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8627 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8628 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8629 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8630}
8631
8632#endif
8633
8634
8635/*
8636 * PADDD / VPADDD.
8637 */
8638#ifdef IEM_WITHOUT_ASSEMBLY
8639
8640IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8641{
8642 RT_NOREF(pFpuState);
8643 RTUINT64U uSrc1 = { *puDst };
8644 RTUINT64U uSrc2 = { *puSrc };
8645 RTUINT64U uDst;
8646 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8647 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8648 *puDst = uDst.u;
8649}
8650
8651
8652IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8653{
8654 RT_NOREF(pFpuState);
8655 RTUINT128U uSrc1 = *puDst;
8656 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8657 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8658 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8659 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8660}
8661
8662#endif /* IEM_WITHOUT_ASSEMBLY */
8663
8664IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8665 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8666{
8667 RT_NOREF(pExtState);
8668 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8669 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8670 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8671 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8672}
8673
8674IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8675 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8676{
8677 RT_NOREF(pExtState);
8678 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8679 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8680 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8681 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8682 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8683 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8684 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8685 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8686}
8687
8688
8689/*
8690 * PADDQ / VPADDQ.
8691 */
8692#ifdef IEM_WITHOUT_ASSEMBLY
8693
8694IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8695{
8696 RT_NOREF(pFpuState);
8697 *puDst = *puDst + *puSrc;
8698}
8699
8700IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8701{
8702 RT_NOREF(pFpuState);
8703 RTUINT128U uSrc1 = *puDst;
8704 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8705 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8706}
8707
8708#endif
8709
8710IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8711 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8712{
8713 RT_NOREF(pExtState);
8714 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8715 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8716}
8717
8718IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8719 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8720{
8721 RT_NOREF(pExtState);
8722 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8723 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8724 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8725 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8726}
8727
8728
8729/*
8730 * PSUBB / VPSUBB
8731 */
8732#ifdef IEM_WITHOUT_ASSEMBLY
8733
8734IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8735{
8736 RT_NOREF(pFpuState);
8737 RTUINT64U uSrc1 = { *puDst };
8738 RTUINT64U uSrc2 = { *puSrc };
8739 RTUINT64U uDst;
8740 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8741 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8742 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8743 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8744 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8745 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8746 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8747 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8748 *puDst = uDst.u;
8749}
8750
8751
8752IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8753{
8754 RT_NOREF(pFpuState);
8755 RTUINT128U uSrc1 = *puDst;
8756 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8757 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8758 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8759 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8760 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8761 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8762 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8763 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8764 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8765 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8766 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8767 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8768 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8769 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8770 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8771 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8772}
8773
8774#endif
8775
8776IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8777 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8778{
8779 RT_NOREF(pExtState);
8780 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8781 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8782 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8783 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8784 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8785 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8786 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8787 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8788 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8789 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8790 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8791 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8792 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8793 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8794 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8795 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8796}
8797
8798IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8799 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8800{
8801 RT_NOREF(pExtState);
8802 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8803 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8804 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8805 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8806 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8807 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8808 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8809 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8810 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8811 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8812 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8813 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8814 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8815 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8816 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8817 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8818 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
8819 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
8820 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
8821 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
8822 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
8823 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
8824 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
8825 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
8826 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
8827 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
8828 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
8829 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
8830 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
8831 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
8832 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
8833 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
8834}
8835
8836
/*
 * PSUBSB / VPSUBSB
 */
8840#ifdef IEM_WITHOUT_ASSEMBLY
8841
8842IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8843{
8844 RT_NOREF(pFpuState);
8845 RTUINT64U uSrc1 = { *puDst };
8846 RTUINT64U uSrc2 = { *puSrc };
8847 RTUINT64U uDst;
8848 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
8849 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
8850 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
8851 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
8852 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
8853 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
8854 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
8855 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
8856 *puDst = uDst.u;
8857}
8858
8859
8860IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8861{
8862 RT_NOREF(pFpuState);
8863 RTUINT128U uSrc1 = *puDst;
8864 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
8865 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
8866 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
8867 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
8868 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
8869 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
8870 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
8871 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
8872 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
8873 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
8874 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
8875 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
8876 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
8877 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
8878 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
8879 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
8880}
8881
8882#endif
8883
8884
/*
 * PSUBUSB / VPSUBUSB
 */
/**
 * Clamps the result of an unsigned byte subtraction at zero.
 *
 * The value is viewed as 16 bits: anything that fits in a byte (<= 0xff) is
 * returned as that byte, while anything larger - which is how a negative
 * difference appears after the cast - yields 0.
 *
 * @note    The argument is evaluated more than once on the in-range path.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    (   (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
8892
8893#ifdef IEM_WITHOUT_ASSEMBLY
8894
8895IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8896{
8897 RT_NOREF(pFpuState);
8898 RTUINT64U uSrc1 = { *puDst };
8899 RTUINT64U uSrc2 = { *puSrc };
8900 RTUINT64U uDst;
8901 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
8902 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
8903 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
8904 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
8905 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
8906 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
8907 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
8908 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
8909 *puDst = uDst.u;
8910}
8911
8912
8913IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8914{
8915 RT_NOREF(pFpuState);
8916 RTUINT128U uSrc1 = *puDst;
8917 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
8918 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
8919 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
8920 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
8921 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
8922 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
8923 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
8924 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
8925 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
8926 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
8927 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
8928 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
8929 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
8930 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
8931 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
8932 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
8933}
8934
8935#endif
8936
8937
8938/*
8939 * PSUBW / VPSUBW
8940 */
8941#ifdef IEM_WITHOUT_ASSEMBLY
8942
8943IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8944{
8945 RT_NOREF(pFpuState);
8946 RTUINT64U uSrc1 = { *puDst };
8947 RTUINT64U uSrc2 = { *puSrc };
8948 RTUINT64U uDst;
8949 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
8950 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
8951 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
8952 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
8953 *puDst = uDst.u;
8954}
8955
8956
8957IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8958{
8959 RT_NOREF(pFpuState);
8960 RTUINT128U uSrc1 = *puDst;
8961 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
8962 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
8963 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
8964 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
8965 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
8966 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
8967 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
8968 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
8969}
8970
8971#endif
8972
8973IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8974 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8975{
8976 RT_NOREF(pExtState);
8977 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8978 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8979 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8980 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8981 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8982 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8983 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8984 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8985}
8986
8987IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8988 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8989{
8990 RT_NOREF(pExtState);
8991 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8992 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8993 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8994 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8995 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8996 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8997 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8998 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8999 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9000 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9001 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9002 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9003 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9004 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9005 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9006 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9007}
9008
9009
9010/*
9011 * PSUBSW / VPSUBSW
9012 */
9013#ifdef IEM_WITHOUT_ASSEMBLY
9014
9015IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9016{
9017 RT_NOREF(pFpuState);
9018 RTUINT64U uSrc1 = { *puDst };
9019 RTUINT64U uSrc2 = { *puSrc };
9020 RTUINT64U uDst;
9021 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9022 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9023 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9024 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9025 *puDst = uDst.u;
9026}
9027
9028
9029IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9030{
9031 RT_NOREF(pFpuState);
9032 RTUINT128U uSrc1 = *puDst;
9033 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9034 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9035 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9036 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9037 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9038 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9039 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9040 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9041}
9042
9043#endif
9044
9045
9046/*
9047 * PSUBUSW / VPSUBUSW
9048 */
/**
 * Clamps the result of an unsigned word subtraction at zero.
 *
 * The value is viewed as 32 bits: anything that fits in 16 bits (<= 0xffff)
 * is returned as that word, while anything larger - which is how a negative
 * difference appears after the cast - yields 0.
 *
 * @note    The argument is evaluated more than once on the in-range path.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    (   (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9053
9054#ifdef IEM_WITHOUT_ASSEMBLY
9055
9056IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9057{
9058 RT_NOREF(pFpuState);
9059 RTUINT64U uSrc1 = { *puDst };
9060 RTUINT64U uSrc2 = { *puSrc };
9061 RTUINT64U uDst;
9062 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9063 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9064 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9065 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9066 *puDst = uDst.u;
9067}
9068
9069
9070IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9071{
9072 RT_NOREF(pFpuState);
9073 RTUINT128U uSrc1 = *puDst;
9074 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9075 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9076 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9077 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9078 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9079 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9080 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9081 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9082}
9083
9084#endif
9085
9086
9087/*
9088 * PSUBD / VPSUBD.
9089 */
9090#ifdef IEM_WITHOUT_ASSEMBLY
9091
9092IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9093{
9094 RT_NOREF(pFpuState);
9095 RTUINT64U uSrc1 = { *puDst };
9096 RTUINT64U uSrc2 = { *puSrc };
9097 RTUINT64U uDst;
9098 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9099 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9100 *puDst = uDst.u;
9101}
9102
9103
9104IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9105{
9106 RT_NOREF(pFpuState);
9107 RTUINT128U uSrc1 = *puDst;
9108 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9109 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9110 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9111 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9112}
9113
9114#endif /* IEM_WITHOUT_ASSEMBLY */
9115
9116IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9117 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9118{
9119 RT_NOREF(pExtState);
9120 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9121 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9122 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9123 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9124}
9125
9126IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9127 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9128{
9129 RT_NOREF(pExtState);
9130 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9131 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9132 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9133 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9134 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9135 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9136 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9137 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9138}
9139
9140
9141/*
9142 * PSUBQ / VPSUBQ.
9143 */
9144#ifdef IEM_WITHOUT_ASSEMBLY
9145
9146IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9147{
9148 RT_NOREF(pFpuState);
9149 *puDst = *puDst - *puSrc;
9150}
9151
9152IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9153{
9154 RT_NOREF(pFpuState);
9155 RTUINT128U uSrc1 = *puDst;
9156 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9157 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9158}
9159
9160#endif
9161
9162IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9163 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9164{
9165 RT_NOREF(pExtState);
9166 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9167 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9168}
9169
9170IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9171 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9172{
9173 RT_NOREF(pExtState);
9174 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9175 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9176 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9177 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9178}
9179
9180
9181
9182/*
9183 * PMULLW / VPMULLW / PMULLD / VPMULLD
9184 */
9185#ifdef IEM_WITHOUT_ASSEMBLY
9186
9187IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9188{
9189 RT_NOREF(pFpuState);
9190 RTUINT64U uSrc1 = { *puDst };
9191 RTUINT64U uSrc2 = { *puSrc };
9192 RTUINT64U uDst;
9193 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9194 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9195 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9196 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9197 *puDst = uDst.u;
9198}
9199
9200
9201IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9202{
9203 RT_NOREF(pFpuState);
9204 RTUINT128U uSrc1 = *puDst;
9205 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9206 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9207 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9208 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9209 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9210 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9211 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9212 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9213}
9214
9215#endif
9216
9217IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9218{
9219 RTUINT128U uSrc1 = *puDst;
9220
9221 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9222 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9223 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9224 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9225 RT_NOREF(pFpuState);
9226}
9227
9228
9229IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9230{
9231 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9232 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9233 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9234 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9235 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9236 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9237 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9238 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9239}
9240
9241
9242IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9243{
9244 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9245 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9246 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9247 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9248 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9249 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9250 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9251 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9252 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9253 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9254 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9255 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9256 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9257 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9258 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9259 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9260}
9261
9262
9263IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9264{
9265 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9266 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9267 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9268 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9269}
9270
9271
9272IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9273{
9274 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9275 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9276 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9277 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9278 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9279 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9280 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9281 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9282}
9283
9284
9285/*
9286 * PMULHW / VPMULHW
9287 */
9288#ifdef IEM_WITHOUT_ASSEMBLY
9289
9290IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9291{
9292 RT_NOREF(pFpuState);
9293 RTUINT64U uSrc1 = { *puDst };
9294 RTUINT64U uSrc2 = { *puSrc };
9295 RTUINT64U uDst;
9296 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9297 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9298 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9299 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9300 *puDst = uDst.u;
9301}
9302
9303
9304IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9305{
9306 RT_NOREF(pFpuState);
9307 RTUINT128U uSrc1 = *puDst;
9308 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9309 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9310 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9311 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9312 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9313 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9314 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9315 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9316}
9317
9318#endif
9319
9320IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9321{
9322 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9323 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9324 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9325 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9326 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9327 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9328 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9329 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9330}
9331
9332
9333IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9334{
9335 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9336 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9337 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9338 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9339 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9340 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9341 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9342 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9343 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9344 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9345 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9346 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9347 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9348 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9349 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9350 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9351}
9352
9353
9354/*
9355 * PMULHUW / VPMULHUW
9356 */
9357#ifdef IEM_WITHOUT_ASSEMBLY
9358
9359IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9360{
9361 RTUINT64U uSrc1 = { *puDst };
9362 RTUINT64U uSrc2 = { *puSrc };
9363 RTUINT64U uDst;
9364 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9365 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9366 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9367 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9368 *puDst = uDst.u;
9369}
9370
9371
9372IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9373{
9374 RTUINT128U uSrc1 = *puDst;
9375 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9376 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9377 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9378 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9379 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9380 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9381 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9382 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9383}
9384
9385#endif
9386
9387IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9388{
9389 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9390 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9391 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9392 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9393 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9394 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9395 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9396 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9397}
9398
9399
9400IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9401{
9402 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9403 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9404 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9405 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9406 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9407 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9408 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9409 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9410 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9411 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9412 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9413 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9414 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9415 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9416 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9417 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9418}
9419
9420
9421/*
9422 * PSRLW / VPSRLW
9423 */
9424#ifdef IEM_WITHOUT_ASSEMBLY
9425
9426IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9427{
9428 RTUINT64U uSrc1 = { *puDst };
9429 RTUINT64U uSrc2 = { *puSrc };
9430 RTUINT64U uDst;
9431
9432 if (uSrc2.au64[0] <= 15)
9433 {
9434 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9435 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9436 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9437 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9438 }
9439 else
9440 {
9441 uDst.au64[0] = 0;
9442 }
9443 *puDst = uDst.u;
9444}
9445
9446
9447IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9448{
9449 RTUINT64U uSrc1 = { *puDst };
9450 RTUINT64U uDst;
9451
9452 if (uShift <= 15)
9453 {
9454 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9455 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9456 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9457 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9458 }
9459 else
9460 {
9461 uDst.au64[0] = 0;
9462 }
9463 *puDst = uDst.u;
9464}
9465
9466
9467IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9468{
9469 RTUINT128U uSrc1 = *puDst;
9470
9471 if (puSrc->au64[0] <= 15)
9472 {
9473 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9474 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9475 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9476 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9477 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9478 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9479 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9480 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9481 }
9482 else
9483 {
9484 puDst->au64[0] = 0;
9485 puDst->au64[1] = 0;
9486 }
9487}
9488
9489IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9490{
9491 RTUINT128U uSrc1 = *puDst;
9492
9493 if (uShift <= 15)
9494 {
9495 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9496 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9497 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9498 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9499 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9500 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9501 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9502 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9503 }
9504 else
9505 {
9506 puDst->au64[0] = 0;
9507 puDst->au64[1] = 0;
9508 }
9509}
9510
9511#endif
9512
9513
9514/*
9515 * PSRAW / VPSRAW
9516 */
9517#ifdef IEM_WITHOUT_ASSEMBLY
9518
9519IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9520{
9521 RTUINT64U uSrc1 = { *puDst };
9522 RTUINT64U uSrc2 = { *puSrc };
9523 RTUINT64U uDst;
9524
9525 if (uSrc2.au64[0] <= 15)
9526 {
9527 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
9528 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
9529 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
9530 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
9531 }
9532 else
9533 {
9534 uDst.au64[0] = 0;
9535 }
9536 *puDst = uDst.u;
9537}
9538
9539
9540IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9541{
9542 RTUINT64U uSrc1 = { *puDst };
9543 RTUINT64U uDst;
9544
9545 if (uShift <= 15)
9546 {
9547 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
9548 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
9549 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
9550 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
9551 }
9552 else
9553 {
9554 uDst.au64[0] = 0;
9555 }
9556 *puDst = uDst.u;
9557}
9558
9559
9560IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9561{
9562 RTUINT128U uSrc1 = *puDst;
9563
9564 if (puSrc->au64[0] <= 15)
9565 {
9566 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
9567 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
9568 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
9569 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
9570 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
9571 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
9572 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
9573 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
9574 }
9575 else
9576 {
9577 puDst->au64[0] = 0;
9578 puDst->au64[1] = 0;
9579 }
9580}
9581
9582IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9583{
9584 RTUINT128U uSrc1 = *puDst;
9585
9586 if (uShift <= 15)
9587 {
9588 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
9589 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
9590 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
9591 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
9592 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
9593 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
9594 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
9595 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
9596 }
9597 else
9598 {
9599 puDst->au64[0] = 0;
9600 puDst->au64[1] = 0;
9601 }
9602}
9603
9604#endif
9605
9606
9607/*
9608 * PSLLW / VPSLLW
9609 */
9610#ifdef IEM_WITHOUT_ASSEMBLY
9611
9612IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9613{
9614 RTUINT64U uSrc1 = { *puDst };
9615 RTUINT64U uSrc2 = { *puSrc };
9616 RTUINT64U uDst;
9617
9618 if (uSrc2.au64[0] <= 15)
9619 {
9620 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
9621 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
9622 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
9623 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
9624 }
9625 else
9626 {
9627 uDst.au64[0] = 0;
9628 }
9629 *puDst = uDst.u;
9630}
9631
9632
9633IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9634{
9635 RTUINT64U uSrc1 = { *puDst };
9636 RTUINT64U uDst;
9637
9638 if (uShift <= 15)
9639 {
9640 uDst.au16[0] = uSrc1.au16[0] << uShift;
9641 uDst.au16[1] = uSrc1.au16[1] << uShift;
9642 uDst.au16[2] = uSrc1.au16[2] << uShift;
9643 uDst.au16[3] = uSrc1.au16[3] << uShift;
9644 }
9645 else
9646 {
9647 uDst.au64[0] = 0;
9648 }
9649 *puDst = uDst.u;
9650}
9651
9652
9653IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9654{
9655 RTUINT128U uSrc1 = *puDst;
9656
9657 if (puSrc->au64[0] <= 15)
9658 {
9659 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
9660 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
9661 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
9662 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
9663 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
9664 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
9665 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
9666 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
9667 }
9668 else
9669 {
9670 puDst->au64[0] = 0;
9671 puDst->au64[1] = 0;
9672 }
9673}
9674
9675IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9676{
9677 RTUINT128U uSrc1 = *puDst;
9678
9679 if (uShift <= 15)
9680 {
9681 puDst->au16[0] = uSrc1.au16[0] << uShift;
9682 puDst->au16[1] = uSrc1.au16[1] << uShift;
9683 puDst->au16[2] = uSrc1.au16[2] << uShift;
9684 puDst->au16[3] = uSrc1.au16[3] << uShift;
9685 puDst->au16[4] = uSrc1.au16[4] << uShift;
9686 puDst->au16[5] = uSrc1.au16[5] << uShift;
9687 puDst->au16[6] = uSrc1.au16[6] << uShift;
9688 puDst->au16[7] = uSrc1.au16[7] << uShift;
9689 }
9690 else
9691 {
9692 puDst->au64[0] = 0;
9693 puDst->au64[1] = 0;
9694 }
9695}
9696
9697#endif
9698
9699
9700/*
9701 * PSRLD / VPSRLD
9702 */
9703#ifdef IEM_WITHOUT_ASSEMBLY
9704
9705IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9706{
9707 RTUINT64U uSrc1 = { *puDst };
9708 RTUINT64U uSrc2 = { *puSrc };
9709 RTUINT64U uDst;
9710
9711 if (uSrc2.au64[0] <= 31)
9712 {
9713 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
9714 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
9715 }
9716 else
9717 {
9718 uDst.au64[0] = 0;
9719 }
9720 *puDst = uDst.u;
9721}
9722
9723
9724IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9725{
9726 RTUINT64U uSrc1 = { *puDst };
9727 RTUINT64U uDst;
9728
9729 if (uShift <= 31)
9730 {
9731 uDst.au32[0] = uSrc1.au32[0] >> uShift;
9732 uDst.au32[1] = uSrc1.au32[1] >> uShift;
9733 }
9734 else
9735 {
9736 uDst.au64[0] = 0;
9737 }
9738 *puDst = uDst.u;
9739}
9740
9741
9742IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9743{
9744 RTUINT128U uSrc1 = *puDst;
9745
9746 if (puSrc->au64[0] <= 31)
9747 {
9748 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
9749 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
9750 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
9751 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
9752 }
9753 else
9754 {
9755 puDst->au64[0] = 0;
9756 puDst->au64[1] = 0;
9757 }
9758}
9759
9760IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9761{
9762 RTUINT128U uSrc1 = *puDst;
9763
9764 if (uShift <= 31)
9765 {
9766 puDst->au32[0] = uSrc1.au32[0] >> uShift;
9767 puDst->au32[1] = uSrc1.au32[1] >> uShift;
9768 puDst->au32[2] = uSrc1.au32[2] >> uShift;
9769 puDst->au32[3] = uSrc1.au32[3] >> uShift;
9770 }
9771 else
9772 {
9773 puDst->au64[0] = 0;
9774 puDst->au64[1] = 0;
9775 }
9776}
9777
9778#endif
9779
9780
9781/*
9782 * PSRAD / VPSRAD
9783 */
9784#ifdef IEM_WITHOUT_ASSEMBLY
9785
9786IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
9787{
9788 RTUINT64U uSrc1 = { *puDst };
9789 RTUINT64U uSrc2 = { *puSrc };
9790 RTUINT64U uDst;
9791
9792 if (uSrc2.au64[0] <= 31)
9793 {
9794 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
9795 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
9796 }
9797 else
9798 {
9799 uDst.au64[0] = 0;
9800 }
9801 *puDst = uDst.u;
9802}
9803
9804
9805IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
9806{
9807 RTUINT64U uSrc1 = { *puDst };
9808 RTUINT64U uDst;
9809
9810 if (uShift <= 31)
9811 {
9812 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
9813 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
9814 }
9815 else
9816 {
9817 uDst.au64[0] = 0;
9818 }
9819 *puDst = uDst.u;
9820}
9821
9822
9823IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9824{
9825 RTUINT128U uSrc1 = *puDst;
9826
9827 if (puSrc->au64[0] <= 31)
9828 {
9829 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
9830 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
9831 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
9832 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
9833 }
9834 else
9835 {
9836 puDst->au64[0] = 0;
9837 puDst->au64[1] = 0;
9838 }
9839}
9840
9841IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9842{
9843 RTUINT128U uSrc1 = *puDst;
9844
9845 if (uShift <= 31)
9846 {
9847 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
9848 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
9849 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
9850 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
9851 }
9852 else
9853 {
9854 puDst->au64[0] = 0;
9855 puDst->au64[1] = 0;
9856 }
9857}
9858
9859#endif
9860
9861
9862/*
9863 * PSLLD / VPSLLD
9864 */
9865#ifdef IEM_WITHOUT_ASSEMBLY
9866
9867IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9868{
9869 RTUINT64U uSrc1 = { *puDst };
9870 RTUINT64U uSrc2 = { *puSrc };
9871 RTUINT64U uDst;
9872
9873 if (uSrc2.au64[0] <= 31)
9874 {
9875 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
9876 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
9877 }
9878 else
9879 {
9880 uDst.au64[0] = 0;
9881 }
9882 *puDst = uDst.u;
9883}
9884
9885
9886IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9887{
9888 RTUINT64U uSrc1 = { *puDst };
9889 RTUINT64U uDst;
9890
9891 if (uShift <= 31)
9892 {
9893 uDst.au32[0] = uSrc1.au32[0] << uShift;
9894 uDst.au32[1] = uSrc1.au32[1] << uShift;
9895 }
9896 else
9897 {
9898 uDst.au64[0] = 0;
9899 }
9900 *puDst = uDst.u;
9901}
9902
9903
9904IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9905{
9906 RTUINT128U uSrc1 = *puDst;
9907
9908 if (puSrc->au64[0] <= 31)
9909 {
9910 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
9911 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
9912 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
9913 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
9914 }
9915 else
9916 {
9917 puDst->au64[0] = 0;
9918 puDst->au64[1] = 0;
9919 }
9920}
9921
9922IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9923{
9924 RTUINT128U uSrc1 = *puDst;
9925
9926 if (uShift <= 31)
9927 {
9928 puDst->au32[0] = uSrc1.au32[0] << uShift;
9929 puDst->au32[1] = uSrc1.au32[1] << uShift;
9930 puDst->au32[2] = uSrc1.au32[2] << uShift;
9931 puDst->au32[3] = uSrc1.au32[3] << uShift;
9932 }
9933 else
9934 {
9935 puDst->au64[0] = 0;
9936 puDst->au64[1] = 0;
9937 }
9938}
9939
9940#endif
9941
9942
9943/*
9944 * PSRLQ / VPSRLQ
9945 */
9946#ifdef IEM_WITHOUT_ASSEMBLY
9947
9948IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9949{
9950 RTUINT64U uSrc1 = { *puDst };
9951 RTUINT64U uSrc2 = { *puSrc };
9952 RTUINT64U uDst;
9953
9954 if (uSrc2.au64[0] <= 63)
9955 {
9956 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
9957 }
9958 else
9959 {
9960 uDst.au64[0] = 0;
9961 }
9962 *puDst = uDst.u;
9963}
9964
9965
9966IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
9967{
9968 RTUINT64U uSrc1 = { *puDst };
9969 RTUINT64U uDst;
9970
9971 if (uShift <= 63)
9972 {
9973 uDst.au64[0] = uSrc1.au64[0] >> uShift;
9974 }
9975 else
9976 {
9977 uDst.au64[0] = 0;
9978 }
9979 *puDst = uDst.u;
9980}
9981
9982
9983IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9984{
9985 RTUINT128U uSrc1 = *puDst;
9986
9987 if (puSrc->au64[0] <= 63)
9988 {
9989 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
9990 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
9991 }
9992 else
9993 {
9994 puDst->au64[0] = 0;
9995 puDst->au64[1] = 0;
9996 }
9997}
9998
9999IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10000{
10001 RTUINT128U uSrc1 = *puDst;
10002
10003 if (uShift <= 63)
10004 {
10005 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10006 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10007 }
10008 else
10009 {
10010 puDst->au64[0] = 0;
10011 puDst->au64[1] = 0;
10012 }
10013}
10014
10015#endif
10016
10017
10018/*
10019 * PSLLQ / VPSLLQ
10020 */
10021#ifdef IEM_WITHOUT_ASSEMBLY
10022
10023IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10024{
10025 RTUINT64U uSrc1 = { *puDst };
10026 RTUINT64U uSrc2 = { *puSrc };
10027 RTUINT64U uDst;
10028
10029 if (uSrc2.au64[0] <= 63)
10030 {
10031 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10032 }
10033 else
10034 {
10035 uDst.au64[0] = 0;
10036 }
10037 *puDst = uDst.u;
10038}
10039
10040
10041IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10042{
10043 RTUINT64U uSrc1 = { *puDst };
10044 RTUINT64U uDst;
10045
10046 if (uShift <= 63)
10047 {
10048 uDst.au64[0] = uSrc1.au64[0] << uShift;
10049 }
10050 else
10051 {
10052 uDst.au64[0] = 0;
10053 }
10054 *puDst = uDst.u;
10055}
10056
10057
10058IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10059{
10060 RTUINT128U uSrc1 = *puDst;
10061
10062 if (puSrc->au64[0] <= 63)
10063 {
10064 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10065 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10066 }
10067 else
10068 {
10069 puDst->au64[0] = 0;
10070 puDst->au64[1] = 0;
10071 }
10072}
10073
10074IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10075{
10076 RTUINT128U uSrc1 = *puDst;
10077
10078 if (uShift <= 63)
10079 {
10080 puDst->au64[0] = uSrc1.au64[0] << uShift;
10081 puDst->au64[1] = uSrc1.au64[1] << uShift;
10082 }
10083 else
10084 {
10085 puDst->au64[0] = 0;
10086 puDst->au64[1] = 0;
10087 }
10088}
10089
10090#endif
10091
10092
10093/*
10094 * PSRLDQ / VPSRLDQ
10095 */
10096#ifdef IEM_WITHOUT_ASSEMBLY
10097
10098IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10099{
10100 RTUINT128U uSrc1 = *puDst;
10101
10102 if (uShift < 16)
10103 {
10104 int i;
10105
10106 for (i = 0; i < 16 - uShift; ++i)
10107 puDst->au8[i] = uSrc1.au8[i + uShift];
10108 for (i = 16 - uShift; i < 16; ++i)
10109 puDst->au8[i] = 0;
10110 }
10111 else
10112 {
10113 puDst->au64[0] = 0;
10114 puDst->au64[1] = 0;
10115 }
10116}
10117
10118#endif
10119
10120
10121/*
10122 * PSLLDQ / VPSLLDQ
10123 */
10124#ifdef IEM_WITHOUT_ASSEMBLY
10125
10126IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10127{
10128 RTUINT128U uSrc1 = *puDst;
10129
10130 if (uShift < 16)
10131 {
10132 int i;
10133
10134 for (i = 0; i < uShift; ++i)
10135 puDst->au8[i] = 0;
10136 for (i = uShift; i < 16; ++i)
10137 puDst->au8[i] = uSrc1.au8[i - uShift];
10138 }
10139 else
10140 {
10141 puDst->au64[0] = 0;
10142 puDst->au64[1] = 0;
10143 }
10144}
10145
10146#endif
10147
10148
10149/*
10150 * PMADDWD / VPMADDWD
10151 */
10152#ifdef IEM_WITHOUT_ASSEMBLY
10153
10154IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10155{
10156 RTUINT64U uSrc1 = { *puDst };
10157 RTUINT64U uSrc2 = { *puSrc };
10158 RTUINT64U uDst;
10159
10160 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
10161 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
10162 *puDst = uDst.u;
10163 RT_NOREF(pFpuState);
10164}
10165
10166
10167IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10168{
10169 RTUINT128U uSrc1 = *puDst;
10170
10171 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
10172 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
10173 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
10174 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
10175 RT_NOREF(pFpuState);
10176}
10177
10178#endif
10179
10180
10181/*
10182 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
10183 */
10184#ifdef IEM_WITHOUT_ASSEMBLY
10185
10186IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10187{
10188 RTUINT64U uSrc1 = { *puDst };
10189 RTUINT64U uSrc2 = { *puSrc };
10190 RTUINT64U uDst;
10191
10192 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
10193 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
10194 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
10195 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
10196 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
10197 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
10198 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
10199 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
10200 *puDst = uDst.u;
10201 RT_NOREF(pFpuState);
10202}
10203
10204
10205IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10206{
10207 RTUINT128U uSrc1 = *puDst;
10208
10209 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
10210 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
10211 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
10212 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
10213 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
10214 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
10215 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
10216 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
10217 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
10218 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
10219 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
10220 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
10221 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
10222 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
10223 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
10224 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
10225 RT_NOREF(pFpuState);
10226}
10227
10228#endif
10229
10230
10231IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10232{
10233 RTUINT128U uSrc1 = *puDst;
10234
10235 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
10236 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
10237 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
10238 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
10239 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
10240 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
10241 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
10242 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
10243 RT_NOREF(pFpuState);
10244}
10245
10246
10247IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10248{
10249 RTUINT128U uSrc1 = *puDst;
10250
10251 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
10252 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
10253 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
10254 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
10255 RT_NOREF(pFpuState);
10256}
10257
10258
10259IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10260 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10261{
10262 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10263 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10264 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10265 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10266 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10267 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10268 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10269 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10270 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10271 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10272 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10273 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10274 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10275 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10276 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10277 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10278 RT_NOREF(pExtState);
10279}
10280
10281
10282IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10283 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10284{
10285 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10286 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10287 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10288 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10289 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10290 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10291 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10292 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10293 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10294 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10295 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10296 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10297 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10298 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10299 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10300 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10301 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
10302 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
10303 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
10304 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
10305 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
10306 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
10307 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
10308 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
10309 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
10310 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
10311 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
10312 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
10313 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
10314 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
10315 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
10316 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
10317 RT_NOREF(pExtState);
10318}
10319
10320
10321IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10322 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10323{
10324 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10325 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10326 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10327 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10328 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10329 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10330 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10331 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10332 RT_NOREF(pExtState);
10333}
10334
10335
10336IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10337 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10338{
10339 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10340 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10341 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10342 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10343 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10344 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10345 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10346 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10347 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10348 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10349 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
10350 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
10351 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
10352 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
10353 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
10354 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
10355 RT_NOREF(pExtState);
10356}
10357
10358
10359IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10360 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10361{
10362 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10363 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10364 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10365 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10366 RT_NOREF(pExtState);
10367}
10368
10369
10370IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10371 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10372{
10373 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10374 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10375 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10376 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10377 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10378 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10379 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10380 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10381 RT_NOREF(pExtState);
10382}
10383
10384
10385/*
10386 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
10387 */
10388#ifdef IEM_WITHOUT_ASSEMBLY
10389
10390IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10391{
10392 RTUINT64U uSrc1 = { *puDst };
10393 RTUINT64U uSrc2 = { *puSrc };
10394 RTUINT64U uDst;
10395
10396 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
10397 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
10398 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
10399 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
10400 *puDst = uDst.u;
10401 RT_NOREF(pFpuState);
10402}
10403
10404
10405IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10406{
10407 RTUINT128U uSrc1 = *puDst;
10408
10409 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10410 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10411 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10412 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10413 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10414 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10415 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10416 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10417 RT_NOREF(pFpuState);
10418}
10419
10420#endif
10421
10422IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10423{
10424 RTUINT128U uSrc1 = *puDst;
10425
10426 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10427 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10428 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10429 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10430 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10431 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10432 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10433 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10434 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10435 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10436 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
10437 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
10438 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
10439 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
10440 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
10441 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
10442 RT_NOREF(pFpuState);
10443}
10444
10445
10446IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10447{
10448 RTUINT128U uSrc1 = *puDst;
10449
10450 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10451 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10452 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10453 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10454 RT_NOREF(pFpuState);
10455}
10456
10457
10458IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10459 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10460{
10461 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10462 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10463 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10464 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10465 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10466 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10467 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10468 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10469 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10470 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10471 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10472 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10473 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10474 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10475 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10476 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10477 RT_NOREF(pExtState);
10478}
10479
10480
10481IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10482 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10483{
10484 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10485 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10486 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10487 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10488 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10489 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10490 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10491 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10492 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10493 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10494 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10495 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10496 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10497 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10498 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10499 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10500 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
10501 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
10502 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
10503 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
10504 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
10505 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
10506 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
10507 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
10508 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
10509 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
10510 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
10511 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
10512 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
10513 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
10514 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
10515 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
10516 RT_NOREF(pExtState);
10517}
10518
10519
10520IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10521 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10522{
10523 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10524 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10525 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10526 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10527 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10528 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10529 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10530 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10531 RT_NOREF(pExtState);
10532}
10533
10534
10535IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10536 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10537{
10538 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10539 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10540 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10541 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10542 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10543 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10544 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10545 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10546 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10547 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10548 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
10549 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
10550 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
10551 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
10552 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
10553 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
10554 RT_NOREF(pExtState);
10555}
10556
10557
10558IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10559 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10560{
10561 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10562 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10563 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10564 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10565 RT_NOREF(pExtState);
10566}
10567
10568
10569IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10570 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10571{
10572 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10573 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10574 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10575 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10576 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10577 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10578 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10579 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10580 RT_NOREF(pExtState);
10581}
10582
10583
10584/*
10585 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
10586 */
10587#ifdef IEM_WITHOUT_ASSEMBLY
10588
10589IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10590{
10591 RTUINT64U uSrc1 = { *puDst };
10592 RTUINT64U uSrc2 = { *puSrc };
10593 RTUINT64U uDst;
10594
10595 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
10596 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
10597 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
10598 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
10599 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
10600 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
10601 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
10602 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
10603 *puDst = uDst.u;
10604 RT_NOREF(pFpuState);
10605}
10606
10607
10608IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10609{
10610 RTUINT128U uSrc1 = *puDst;
10611
10612 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
10613 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
10614 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
10615 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
10616 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
10617 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
10618 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
10619 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
10620 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
10621 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
10622 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
10623 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
10624 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
10625 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
10626 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
10627 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
10628 RT_NOREF(pFpuState);
10629}
10630
10631#endif
10632
10633IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10634{
10635 RTUINT128U uSrc1 = *puDst;
10636
10637 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
10638 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
10639 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
10640 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
10641 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
10642 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
10643 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
10644 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
10645 RT_NOREF(pFpuState);
10646}
10647
10648
10649IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10650{
10651 RTUINT128U uSrc1 = *puDst;
10652
10653 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
10654 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
10655 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
10656 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
10657 RT_NOREF(pFpuState);
10658}
10659
10660
10661IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10662 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10663{
10664 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10665 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10666 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10667 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10668 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10669 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10670 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10671 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10672 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10673 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10674 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10675 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10676 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10677 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10678 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10679 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10680 RT_NOREF(pExtState);
10681}
10682
10683
10684IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10685 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10686{
10687 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10688 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10689 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10690 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10691 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10692 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10693 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10694 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10695 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10696 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10697 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10698 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10699 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10700 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10701 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10702 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10703 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
10704 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
10705 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
10706 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
10707 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
10708 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
10709 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
10710 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
10711 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
10712 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
10713 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
10714 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
10715 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
10716 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
10717 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
10718 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
10719 RT_NOREF(pExtState);
10720}
10721
10722
10723IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10724 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10725{
10726 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10727 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10728 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10729 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10730 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10731 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10732 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10733 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10734 RT_NOREF(pExtState);
10735}
10736
10737
10738IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10739 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10740{
10741 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10742 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10743 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10744 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10745 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10746 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10747 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10748 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10749 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10750 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10751 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
10752 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
10753 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
10754 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
10755 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
10756 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
10757 RT_NOREF(pExtState);
10758}
10759
10760
10761IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10762 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10763{
10764 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10765 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10766 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10767 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10768 RT_NOREF(pExtState);
10769}
10770
10771
10772IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10773 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10774{
10775 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10776 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10777 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10778 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10779 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10780 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10781 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10782 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10783 RT_NOREF(pExtState);
10784}
10785
10786
10787/*
10788 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
10789 */
10790#ifdef IEM_WITHOUT_ASSEMBLY
10791
10792IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10793{
10794 RTUINT64U uSrc1 = { *puDst };
10795 RTUINT64U uSrc2 = { *puSrc };
10796 RTUINT64U uDst;
10797
10798 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
10799 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
10800 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
10801 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
10802 *puDst = uDst.u;
10803 RT_NOREF(pFpuState);
10804}
10805
10806
10807IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10808{
10809 RTUINT128U uSrc1 = *puDst;
10810
10811 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10812 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10813 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10814 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10815 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10816 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10817 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10818 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10819 RT_NOREF(pFpuState);
10820}
10821
10822#endif
10823
10824IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10825{
10826 RTUINT128U uSrc1 = *puDst;
10827
10828 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10829 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10830 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10831 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10832 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10833 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10834 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10835 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10836 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10837 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10838 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
10839 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
10840 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
10841 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
10842 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
10843 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
10844 RT_NOREF(pFpuState);
10845}
10846
10847
10848IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10849{
10850 RTUINT128U uSrc1 = *puDst;
10851
10852 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10853 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10854 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10855 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10856 RT_NOREF(pFpuState);
10857}
10858
10859
10860IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10861 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10862{
10863 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10864 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10865 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10866 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10867 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10868 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10869 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10870 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10871 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10872 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10873 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10874 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10875 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10876 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10877 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10878 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10879 RT_NOREF(pExtState);
10880}
10881
10882
10883IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10884 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10885{
10886 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10887 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10888 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10889 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10890 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10891 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10892 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10893 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10894 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10895 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10896 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10897 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10898 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10899 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10900 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10901 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10902 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
10903 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
10904 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
10905 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
10906 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
10907 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
10908 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
10909 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
10910 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
10911 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
10912 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
10913 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
10914 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
10915 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
10916 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
10917 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
10918 RT_NOREF(pExtState);
10919}
10920
10921
10922IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10923 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10924{
10925 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10926 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10927 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10928 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10929 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10930 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10931 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10932 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10933 RT_NOREF(pExtState);
10934}
10935
10936
10937IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10938 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10939{
10940 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10941 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10942 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10943 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10944 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10945 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10946 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10947 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10948 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10949 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10950 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
10951 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
10952 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
10953 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
10954 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
10955 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
10956 RT_NOREF(pExtState);
10957}
10958
10959
10960IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10961 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10962{
10963 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10964 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10965 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10966 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10967 RT_NOREF(pExtState);
10968}
10969
10970
10971IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10972 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10973{
10974 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10975 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10976 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10977 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10978 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10979 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10980 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10981 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10982 RT_NOREF(pExtState);
10983}
10984
10985
10986/*
10987 * PAVGB / VPAVGB / PAVGW / VPAVGW
10988 */
/** Rounded-up average of two byte values, (a + b + 1) >> 1; the first operand
 *  is widened before the addition so the intermediate sum is not cut to 8 bits. */
#define PAVGB_EXEC(a_uLeft, a_uRight) \
    ( (uint8_t)( ((uint16_t)(a_uLeft) + (a_uRight) + 1) >> 1 ) )
/** Rounded-up average of two word values, (a + b + 1) >> 1; the first operand
 *  is widened before the addition so the intermediate sum is not cut to 16 bits. */
#define PAVGW_EXEC(a_uLeft, a_uRight) \
    ( (uint16_t)( ((uint32_t)(a_uLeft) + (a_uRight) + 1) >> 1 ) )
10991
10992#ifdef IEM_WITHOUT_ASSEMBLY
10993
10994IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
10995{
10996 RTUINT64U uSrc1 = { *puDst };
10997 RTUINT64U uSrc2 = { *puSrc };
10998 RTUINT64U uDst;
10999
11000 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
11001 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
11002 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
11003 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
11004 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
11005 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
11006 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
11007 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
11008 *puDst = uDst.u;
11009}
11010
11011
11012IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11013{
11014 RTUINT128U uSrc1 = *puDst;
11015
11016 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11017 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11018 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11019 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11020 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11021 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11022 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11023 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11024 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11025 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11026 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11027 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11028 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11029 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11030 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11031 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11032}
11033
11034
11035IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11036{
11037 RTUINT64U uSrc1 = { *puDst };
11038 RTUINT64U uSrc2 = { *puSrc };
11039 RTUINT64U uDst;
11040
11041 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
11042 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
11043 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
11044 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
11045 *puDst = uDst.u;
11046}
11047
11048
11049IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11050{
11051 RTUINT128U uSrc1 = *puDst;
11052
11053 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
11054 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
11055 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
11056 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
11057 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
11058 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
11059 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
11060 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
11061}
11062
11063#endif
11064
11065IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11066{
11067 RTUINT128U uSrc1 = *puDst;
11068
11069 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11070 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11071 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11072 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11073 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11074 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11075 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11076 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11077 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11078 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11079 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11080 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11081 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11082 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11083 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11084 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11085}
11086
11087
11088IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11089{
11090 RTUINT128U uSrc1 = *puDst;
11091
11092 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11093 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11094 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11095 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11096 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11097 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11098 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11099 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11100 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11101 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11102 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11103 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11104 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11105 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11106 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11107 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11108}
11109
11110
11111IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11112{
11113 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11114 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11115 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11116 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11117 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11118 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11119 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11120 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11121 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11122 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11123 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11124 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11125 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11126 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11127 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11128 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11129}
11130
11131
11132IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11133{
11134 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11135 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11136 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11137 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11138 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11139 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11140 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11141 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11142 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11143 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11144 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11145 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11146 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11147 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11148 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11149 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11150 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
11151 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
11152 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
11153 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
11154 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
11155 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
11156 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
11157 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
11158 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
11159 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
11160 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
11161 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
11162 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
11163 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
11164 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
11165 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
11166}
11167
11168
11169IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11170{
11171 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11172 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11173 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11174 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11175 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11176 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11177 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11178 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11179}
11180
11181
11182IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11183{
11184 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11185 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11186 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11187 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11188 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11189 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11190 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11191 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11192 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11193 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11194 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
11195 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
11196 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
11197 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
11198 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
11199 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
11200}
11201
11202#undef PAVGB_EXEC
11203#undef PAVGW_EXEC
11204
11205
11206/*
11207 * PMOVMSKB / VPMOVMSKB
11208 */
11209#ifdef IEM_WITHOUT_ASSEMBLY
11210
11211IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
11212{
11213 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11214 uint64_t const uSrc = *pu64Src;
11215 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
11216 | ((uSrc >> (15-1)) & RT_BIT_64(1))
11217 | ((uSrc >> (23-2)) & RT_BIT_64(2))
11218 | ((uSrc >> (31-3)) & RT_BIT_64(3))
11219 | ((uSrc >> (39-4)) & RT_BIT_64(4))
11220 | ((uSrc >> (47-5)) & RT_BIT_64(5))
11221 | ((uSrc >> (55-6)) & RT_BIT_64(6))
11222 | ((uSrc >> (63-7)) & RT_BIT_64(7));
11223}
11224
11225
11226IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
11227{
11228 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11229 uint64_t const uSrc0 = pu128Src->QWords.qw0;
11230 uint64_t const uSrc1 = pu128Src->QWords.qw1;
11231 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11232 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11233 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11234 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11235 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11236 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11237 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11238 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11239 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11240 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11241 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11242 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11243 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11244 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11245 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11246 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
11247}
11248
11249#endif
11250
11251IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
11252{
11253 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11254 uint64_t const uSrc0 = puSrc->QWords.qw0;
11255 uint64_t const uSrc1 = puSrc->QWords.qw1;
11256 uint64_t const uSrc2 = puSrc->QWords.qw2;
11257 uint64_t const uSrc3 = puSrc->QWords.qw3;
11258 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11259 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11260 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11261 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11262 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11263 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11264 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11265 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11266 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11267 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11268 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11269 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11270 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11271 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11272 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11273 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
11274 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
11275 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
11276 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
11277 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
11278 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
11279 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
11280 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
11281 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
11282 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
11283 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
11284 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
11285 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
11286 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
11287 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
11288 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
11289 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
11290}
11291
11292
11293/*
11294 * [V]PSHUFB
11295 */
11296
11297IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11298{
11299 RTUINT64U const uSrc = { *puSrc };
11300 RTUINT64U const uDstIn = { *puDst };
11301 ASMCompilerBarrier();
11302 RTUINT64U uDstOut = { 0 };
11303 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
11304 {
11305 uint8_t idxSrc = uSrc.au8[iByte];
11306 if (!(idxSrc & 0x80))
11307 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
11308 }
11309 *puDst = uDstOut.u;
11310 RT_NOREF(pFpuState);
11311}
11312
11313
11314IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11315{
11316 RTUINT128U const uSrc = *puSrc;
11317 RTUINT128U const uDstIn = *puDst;
11318 ASMCompilerBarrier();
11319 puDst->au64[0] = 0;
11320 puDst->au64[1] = 0;
11321 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11322 {
11323 uint8_t idxSrc = uSrc.au8[iByte];
11324 if (!(idxSrc & 0x80))
11325 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
11326 }
11327 RT_NOREF(pFpuState);
11328}
11329
11330
11331IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11332 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11333{
11334 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
11335 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
11336 ASMCompilerBarrier();
11337 puDst->au64[0] = 0;
11338 puDst->au64[1] = 0;
11339 for (unsigned iByte = 0; iByte < 16; iByte++)
11340 {
11341 uint8_t idxSrc = uSrc2.au8[iByte];
11342 if (!(idxSrc & 0x80))
11343 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11344 }
11345 RT_NOREF(pExtState);
11346}
11347
11348
11349IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11350 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11351{
11352 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
11353 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
11354 ASMCompilerBarrier();
11355 puDst->au64[0] = 0;
11356 puDst->au64[1] = 0;
11357 puDst->au64[2] = 0;
11358 puDst->au64[3] = 0;
11359 for (unsigned iByte = 0; iByte < 16; iByte++)
11360 {
11361 uint8_t idxSrc = uSrc2.au8[iByte];
11362 if (!(idxSrc & 0x80))
11363 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11364 }
11365 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11366 {
11367 uint8_t idxSrc = uSrc2.au8[iByte];
11368 if (!(idxSrc & 0x80))
11369 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
11370 }
11371 RT_NOREF(pExtState);
11372}
11373
11374
11375/*
11376 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
11377 */
11378#ifdef IEM_WITHOUT_ASSEMBLY
11379
11380IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
11381{
11382 uint64_t const uSrc = *puSrc;
11383 ASMCompilerBarrier();
11384 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11385 uSrc >> (((bEvil >> 2) & 3) * 16),
11386 uSrc >> (((bEvil >> 4) & 3) * 16),
11387 uSrc >> (((bEvil >> 6) & 3) * 16));
11388}
11389
11390
11391IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11392{
11393 puDst->QWords.qw0 = puSrc->QWords.qw0;
11394 uint64_t const uSrc = puSrc->QWords.qw1;
11395 ASMCompilerBarrier();
11396 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11397 uSrc >> (((bEvil >> 2) & 3) * 16),
11398 uSrc >> (((bEvil >> 4) & 3) * 16),
11399 uSrc >> (((bEvil >> 6) & 3) * 16));
11400}
11401
11402#endif
11403
11404IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11405{
11406 puDst->QWords.qw0 = puSrc->QWords.qw0;
11407 uint64_t const uSrc1 = puSrc->QWords.qw1;
11408 puDst->QWords.qw2 = puSrc->QWords.qw2;
11409 uint64_t const uSrc3 = puSrc->QWords.qw3;
11410 ASMCompilerBarrier();
11411 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
11412 uSrc1 >> (((bEvil >> 2) & 3) * 16),
11413 uSrc1 >> (((bEvil >> 4) & 3) * 16),
11414 uSrc1 >> (((bEvil >> 6) & 3) * 16));
11415 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
11416 uSrc3 >> (((bEvil >> 2) & 3) * 16),
11417 uSrc3 >> (((bEvil >> 4) & 3) * 16),
11418 uSrc3 >> (((bEvil >> 6) & 3) * 16));
11419}
11420
11421#ifdef IEM_WITHOUT_ASSEMBLY
11422IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11423{
11424 puDst->QWords.qw1 = puSrc->QWords.qw1;
11425 uint64_t const uSrc = puSrc->QWords.qw0;
11426 ASMCompilerBarrier();
11427 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11428 uSrc >> (((bEvil >> 2) & 3) * 16),
11429 uSrc >> (((bEvil >> 4) & 3) * 16),
11430 uSrc >> (((bEvil >> 6) & 3) * 16));
11431
11432}
11433#endif
11434
11435
11436IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11437{
11438 puDst->QWords.qw3 = puSrc->QWords.qw3;
11439 uint64_t const uSrc2 = puSrc->QWords.qw2;
11440 puDst->QWords.qw1 = puSrc->QWords.qw1;
11441 uint64_t const uSrc0 = puSrc->QWords.qw0;
11442 ASMCompilerBarrier();
11443 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
11444 uSrc0 >> (((bEvil >> 2) & 3) * 16),
11445 uSrc0 >> (((bEvil >> 4) & 3) * 16),
11446 uSrc0 >> (((bEvil >> 6) & 3) * 16));
11447 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
11448 uSrc2 >> (((bEvil >> 2) & 3) * 16),
11449 uSrc2 >> (((bEvil >> 4) & 3) * 16),
11450 uSrc2 >> (((bEvil >> 6) & 3) * 16));
11451
11452}
11453
11454
11455#ifdef IEM_WITHOUT_ASSEMBLY
11456IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11457{
11458 RTUINT128U const uSrc = *puSrc;
11459 ASMCompilerBarrier();
11460 puDst->au32[0] = uSrc.au32[bEvil & 3];
11461 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
11462 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
11463 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
11464}
11465#endif
11466
11467
11468IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11469{
11470 RTUINT256U const uSrc = *puSrc;
11471 ASMCompilerBarrier();
11472 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
11473 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
11474 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
11475 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
11476 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
11477 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
11478 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
11479 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
11480}
11481
11482
11483/*
11484 * PUNPCKHBW - high bytes -> words
11485 */
11486#ifdef IEM_WITHOUT_ASSEMBLY
11487
11488IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11489{
11490 RTUINT64U const uSrc2 = { *puSrc };
11491 RTUINT64U const uSrc1 = { *puDst };
11492 ASMCompilerBarrier();
11493 RTUINT64U uDstOut;
11494 uDstOut.au8[0] = uSrc1.au8[4];
11495 uDstOut.au8[1] = uSrc2.au8[4];
11496 uDstOut.au8[2] = uSrc1.au8[5];
11497 uDstOut.au8[3] = uSrc2.au8[5];
11498 uDstOut.au8[4] = uSrc1.au8[6];
11499 uDstOut.au8[5] = uSrc2.au8[6];
11500 uDstOut.au8[6] = uSrc1.au8[7];
11501 uDstOut.au8[7] = uSrc2.au8[7];
11502 *puDst = uDstOut.u;
11503}
11504
11505
11506IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11507{
11508 RTUINT128U const uSrc2 = *puSrc;
11509 RTUINT128U const uSrc1 = *puDst;
11510 ASMCompilerBarrier();
11511 RTUINT128U uDstOut;
11512 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11513 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11514 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11515 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11516 uDstOut.au8[ 4] = uSrc1.au8[10];
11517 uDstOut.au8[ 5] = uSrc2.au8[10];
11518 uDstOut.au8[ 6] = uSrc1.au8[11];
11519 uDstOut.au8[ 7] = uSrc2.au8[11];
11520 uDstOut.au8[ 8] = uSrc1.au8[12];
11521 uDstOut.au8[ 9] = uSrc2.au8[12];
11522 uDstOut.au8[10] = uSrc1.au8[13];
11523 uDstOut.au8[11] = uSrc2.au8[13];
11524 uDstOut.au8[12] = uSrc1.au8[14];
11525 uDstOut.au8[13] = uSrc2.au8[14];
11526 uDstOut.au8[14] = uSrc1.au8[15];
11527 uDstOut.au8[15] = uSrc2.au8[15];
11528 *puDst = uDstOut;
11529}
11530
11531#endif
11532
11533IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11534{
11535 RTUINT128U const uSrc2 = *puSrc2;
11536 RTUINT128U const uSrc1 = *puSrc1;
11537 ASMCompilerBarrier();
11538 RTUINT128U uDstOut;
11539 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11540 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11541 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11542 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11543 uDstOut.au8[ 4] = uSrc1.au8[10];
11544 uDstOut.au8[ 5] = uSrc2.au8[10];
11545 uDstOut.au8[ 6] = uSrc1.au8[11];
11546 uDstOut.au8[ 7] = uSrc2.au8[11];
11547 uDstOut.au8[ 8] = uSrc1.au8[12];
11548 uDstOut.au8[ 9] = uSrc2.au8[12];
11549 uDstOut.au8[10] = uSrc1.au8[13];
11550 uDstOut.au8[11] = uSrc2.au8[13];
11551 uDstOut.au8[12] = uSrc1.au8[14];
11552 uDstOut.au8[13] = uSrc2.au8[14];
11553 uDstOut.au8[14] = uSrc1.au8[15];
11554 uDstOut.au8[15] = uSrc2.au8[15];
11555 *puDst = uDstOut;
11556}
11557
11558
11559IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11560{
11561 RTUINT256U const uSrc2 = *puSrc2;
11562 RTUINT256U const uSrc1 = *puSrc1;
11563 ASMCompilerBarrier();
11564 RTUINT256U uDstOut;
11565 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11566 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11567 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11568 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11569 uDstOut.au8[ 4] = uSrc1.au8[10];
11570 uDstOut.au8[ 5] = uSrc2.au8[10];
11571 uDstOut.au8[ 6] = uSrc1.au8[11];
11572 uDstOut.au8[ 7] = uSrc2.au8[11];
11573 uDstOut.au8[ 8] = uSrc1.au8[12];
11574 uDstOut.au8[ 9] = uSrc2.au8[12];
11575 uDstOut.au8[10] = uSrc1.au8[13];
11576 uDstOut.au8[11] = uSrc2.au8[13];
11577 uDstOut.au8[12] = uSrc1.au8[14];
11578 uDstOut.au8[13] = uSrc2.au8[14];
11579 uDstOut.au8[14] = uSrc1.au8[15];
11580 uDstOut.au8[15] = uSrc2.au8[15];
11581 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11582 uDstOut.au8[16] = uSrc1.au8[24];
11583 uDstOut.au8[17] = uSrc2.au8[24];
11584 uDstOut.au8[18] = uSrc1.au8[25];
11585 uDstOut.au8[19] = uSrc2.au8[25];
11586 uDstOut.au8[20] = uSrc1.au8[26];
11587 uDstOut.au8[21] = uSrc2.au8[26];
11588 uDstOut.au8[22] = uSrc1.au8[27];
11589 uDstOut.au8[23] = uSrc2.au8[27];
11590 uDstOut.au8[24] = uSrc1.au8[28];
11591 uDstOut.au8[25] = uSrc2.au8[28];
11592 uDstOut.au8[26] = uSrc1.au8[29];
11593 uDstOut.au8[27] = uSrc2.au8[29];
11594 uDstOut.au8[28] = uSrc1.au8[30];
11595 uDstOut.au8[29] = uSrc2.au8[30];
11596 uDstOut.au8[30] = uSrc1.au8[31];
11597 uDstOut.au8[31] = uSrc2.au8[31];
11598 *puDst = uDstOut;
11599}
11600
11601
11602/*
 * PUNPCKHWD - high words -> dwords
11604 */
11605#ifdef IEM_WITHOUT_ASSEMBLY
11606
11607IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11608{
11609 RTUINT64U const uSrc2 = { *puSrc };
11610 RTUINT64U const uSrc1 = { *puDst };
11611 ASMCompilerBarrier();
11612 RTUINT64U uDstOut;
11613 uDstOut.au16[0] = uSrc1.au16[2];
11614 uDstOut.au16[1] = uSrc2.au16[2];
11615 uDstOut.au16[2] = uSrc1.au16[3];
11616 uDstOut.au16[3] = uSrc2.au16[3];
11617 *puDst = uDstOut.u;
11618}
11619
11620
11621IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11622{
11623 RTUINT128U const uSrc2 = *puSrc;
11624 RTUINT128U const uSrc1 = *puDst;
11625 ASMCompilerBarrier();
11626 RTUINT128U uDstOut;
11627 uDstOut.au16[0] = uSrc1.au16[4];
11628 uDstOut.au16[1] = uSrc2.au16[4];
11629 uDstOut.au16[2] = uSrc1.au16[5];
11630 uDstOut.au16[3] = uSrc2.au16[5];
11631 uDstOut.au16[4] = uSrc1.au16[6];
11632 uDstOut.au16[5] = uSrc2.au16[6];
11633 uDstOut.au16[6] = uSrc1.au16[7];
11634 uDstOut.au16[7] = uSrc2.au16[7];
11635 *puDst = uDstOut;
11636}
11637
11638#endif
11639
11640IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11641{
11642 RTUINT128U const uSrc2 = *puSrc2;
11643 RTUINT128U const uSrc1 = *puSrc1;
11644 ASMCompilerBarrier();
11645 RTUINT128U uDstOut;
11646 uDstOut.au16[0] = uSrc1.au16[4];
11647 uDstOut.au16[1] = uSrc2.au16[4];
11648 uDstOut.au16[2] = uSrc1.au16[5];
11649 uDstOut.au16[3] = uSrc2.au16[5];
11650 uDstOut.au16[4] = uSrc1.au16[6];
11651 uDstOut.au16[5] = uSrc2.au16[6];
11652 uDstOut.au16[6] = uSrc1.au16[7];
11653 uDstOut.au16[7] = uSrc2.au16[7];
11654 *puDst = uDstOut;
11655}
11656
11657
11658IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11659{
11660 RTUINT256U const uSrc2 = *puSrc2;
11661 RTUINT256U const uSrc1 = *puSrc1;
11662 ASMCompilerBarrier();
11663 RTUINT256U uDstOut;
11664 uDstOut.au16[0] = uSrc1.au16[4];
11665 uDstOut.au16[1] = uSrc2.au16[4];
11666 uDstOut.au16[2] = uSrc1.au16[5];
11667 uDstOut.au16[3] = uSrc2.au16[5];
11668 uDstOut.au16[4] = uSrc1.au16[6];
11669 uDstOut.au16[5] = uSrc2.au16[6];
11670 uDstOut.au16[6] = uSrc1.au16[7];
11671 uDstOut.au16[7] = uSrc2.au16[7];
11672
11673 uDstOut.au16[8] = uSrc1.au16[12];
11674 uDstOut.au16[9] = uSrc2.au16[12];
11675 uDstOut.au16[10] = uSrc1.au16[13];
11676 uDstOut.au16[11] = uSrc2.au16[13];
11677 uDstOut.au16[12] = uSrc1.au16[14];
11678 uDstOut.au16[13] = uSrc2.au16[14];
11679 uDstOut.au16[14] = uSrc1.au16[15];
11680 uDstOut.au16[15] = uSrc2.au16[15];
11681 *puDst = uDstOut;
11682}
11683
11684
11685/*
 * PUNPCKHDQ - high dwords -> qword(s)
11687 */
11688#ifdef IEM_WITHOUT_ASSEMBLY
11689
11690IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11691{
11692 RTUINT64U const uSrc2 = { *puSrc };
11693 RTUINT64U const uSrc1 = { *puDst };
11694 ASMCompilerBarrier();
11695 RTUINT64U uDstOut;
11696 uDstOut.au32[0] = uSrc1.au32[1];
11697 uDstOut.au32[1] = uSrc2.au32[1];
11698 *puDst = uDstOut.u;
11699}
11700
11701
11702IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11703{
11704 RTUINT128U const uSrc2 = *puSrc;
11705 RTUINT128U const uSrc1 = *puDst;
11706 ASMCompilerBarrier();
11707 RTUINT128U uDstOut;
11708 uDstOut.au32[0] = uSrc1.au32[2];
11709 uDstOut.au32[1] = uSrc2.au32[2];
11710 uDstOut.au32[2] = uSrc1.au32[3];
11711 uDstOut.au32[3] = uSrc2.au32[3];
11712 *puDst = uDstOut;
11713}
11714
11715#endif
11716
11717IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11718{
11719 RTUINT128U const uSrc2 = *puSrc2;
11720 RTUINT128U const uSrc1 = *puSrc1;
11721 ASMCompilerBarrier();
11722 RTUINT128U uDstOut;
11723 uDstOut.au32[0] = uSrc1.au32[2];
11724 uDstOut.au32[1] = uSrc2.au32[2];
11725 uDstOut.au32[2] = uSrc1.au32[3];
11726 uDstOut.au32[3] = uSrc2.au32[3];
11727 *puDst = uDstOut;
11728}
11729
11730
11731IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11732{
11733 RTUINT256U const uSrc2 = *puSrc2;
11734 RTUINT256U const uSrc1 = *puSrc1;
11735 ASMCompilerBarrier();
11736 RTUINT256U uDstOut;
11737 uDstOut.au32[0] = uSrc1.au32[2];
11738 uDstOut.au32[1] = uSrc2.au32[2];
11739 uDstOut.au32[2] = uSrc1.au32[3];
11740 uDstOut.au32[3] = uSrc2.au32[3];
11741
11742 uDstOut.au32[4] = uSrc1.au32[6];
11743 uDstOut.au32[5] = uSrc2.au32[6];
11744 uDstOut.au32[6] = uSrc1.au32[7];
11745 uDstOut.au32[7] = uSrc2.au32[7];
11746 *puDst = uDstOut;
11747}
11748
11749
11750/*
11751 * PUNPCKHQDQ -> High qwords -> double qword(s).
11752 */
11753#ifdef IEM_WITHOUT_ASSEMBLY
11754IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11755{
11756 RTUINT128U const uSrc2 = *puSrc;
11757 RTUINT128U const uSrc1 = *puDst;
11758 ASMCompilerBarrier();
11759 RTUINT128U uDstOut;
11760 uDstOut.au64[0] = uSrc1.au64[1];
11761 uDstOut.au64[1] = uSrc2.au64[1];
11762 *puDst = uDstOut;
11763}
11764#endif
11765
11766
11767IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11768{
11769 RTUINT128U const uSrc2 = *puSrc2;
11770 RTUINT128U const uSrc1 = *puSrc1;
11771 ASMCompilerBarrier();
11772 RTUINT128U uDstOut;
11773 uDstOut.au64[0] = uSrc1.au64[1];
11774 uDstOut.au64[1] = uSrc2.au64[1];
11775 *puDst = uDstOut;
11776}
11777
11778
11779IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11780{
11781 RTUINT256U const uSrc2 = *puSrc2;
11782 RTUINT256U const uSrc1 = *puSrc1;
11783 ASMCompilerBarrier();
11784 RTUINT256U uDstOut;
11785 uDstOut.au64[0] = uSrc1.au64[1];
11786 uDstOut.au64[1] = uSrc2.au64[1];
11787
11788 uDstOut.au64[2] = uSrc1.au64[3];
11789 uDstOut.au64[3] = uSrc2.au64[3];
11790 *puDst = uDstOut;
11791}
11792
11793
11794/*
11795 * PUNPCKLBW - low bytes -> words
11796 */
11797#ifdef IEM_WITHOUT_ASSEMBLY
11798
11799IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11800{
11801 RTUINT64U const uSrc2 = { *puSrc };
11802 RTUINT64U const uSrc1 = { *puDst };
11803 ASMCompilerBarrier();
11804 RTUINT64U uDstOut;
11805 uDstOut.au8[0] = uSrc1.au8[0];
11806 uDstOut.au8[1] = uSrc2.au8[0];
11807 uDstOut.au8[2] = uSrc1.au8[1];
11808 uDstOut.au8[3] = uSrc2.au8[1];
11809 uDstOut.au8[4] = uSrc1.au8[2];
11810 uDstOut.au8[5] = uSrc2.au8[2];
11811 uDstOut.au8[6] = uSrc1.au8[3];
11812 uDstOut.au8[7] = uSrc2.au8[3];
11813 *puDst = uDstOut.u;
11814}
11815
11816
11817IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11818{
11819 RTUINT128U const uSrc2 = *puSrc;
11820 RTUINT128U const uSrc1 = *puDst;
11821 ASMCompilerBarrier();
11822 RTUINT128U uDstOut;
11823 uDstOut.au8[ 0] = uSrc1.au8[0];
11824 uDstOut.au8[ 1] = uSrc2.au8[0];
11825 uDstOut.au8[ 2] = uSrc1.au8[1];
11826 uDstOut.au8[ 3] = uSrc2.au8[1];
11827 uDstOut.au8[ 4] = uSrc1.au8[2];
11828 uDstOut.au8[ 5] = uSrc2.au8[2];
11829 uDstOut.au8[ 6] = uSrc1.au8[3];
11830 uDstOut.au8[ 7] = uSrc2.au8[3];
11831 uDstOut.au8[ 8] = uSrc1.au8[4];
11832 uDstOut.au8[ 9] = uSrc2.au8[4];
11833 uDstOut.au8[10] = uSrc1.au8[5];
11834 uDstOut.au8[11] = uSrc2.au8[5];
11835 uDstOut.au8[12] = uSrc1.au8[6];
11836 uDstOut.au8[13] = uSrc2.au8[6];
11837 uDstOut.au8[14] = uSrc1.au8[7];
11838 uDstOut.au8[15] = uSrc2.au8[7];
11839 *puDst = uDstOut;
11840}
11841
11842#endif
11843
11844IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11845{
11846 RTUINT128U const uSrc2 = *puSrc2;
11847 RTUINT128U const uSrc1 = *puSrc1;
11848 ASMCompilerBarrier();
11849 RTUINT128U uDstOut;
11850 uDstOut.au8[ 0] = uSrc1.au8[0];
11851 uDstOut.au8[ 1] = uSrc2.au8[0];
11852 uDstOut.au8[ 2] = uSrc1.au8[1];
11853 uDstOut.au8[ 3] = uSrc2.au8[1];
11854 uDstOut.au8[ 4] = uSrc1.au8[2];
11855 uDstOut.au8[ 5] = uSrc2.au8[2];
11856 uDstOut.au8[ 6] = uSrc1.au8[3];
11857 uDstOut.au8[ 7] = uSrc2.au8[3];
11858 uDstOut.au8[ 8] = uSrc1.au8[4];
11859 uDstOut.au8[ 9] = uSrc2.au8[4];
11860 uDstOut.au8[10] = uSrc1.au8[5];
11861 uDstOut.au8[11] = uSrc2.au8[5];
11862 uDstOut.au8[12] = uSrc1.au8[6];
11863 uDstOut.au8[13] = uSrc2.au8[6];
11864 uDstOut.au8[14] = uSrc1.au8[7];
11865 uDstOut.au8[15] = uSrc2.au8[7];
11866 *puDst = uDstOut;
11867}
11868
11869
11870IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11871{
11872 RTUINT256U const uSrc2 = *puSrc2;
11873 RTUINT256U const uSrc1 = *puSrc1;
11874 ASMCompilerBarrier();
11875 RTUINT256U uDstOut;
11876 uDstOut.au8[ 0] = uSrc1.au8[0];
11877 uDstOut.au8[ 1] = uSrc2.au8[0];
11878 uDstOut.au8[ 2] = uSrc1.au8[1];
11879 uDstOut.au8[ 3] = uSrc2.au8[1];
11880 uDstOut.au8[ 4] = uSrc1.au8[2];
11881 uDstOut.au8[ 5] = uSrc2.au8[2];
11882 uDstOut.au8[ 6] = uSrc1.au8[3];
11883 uDstOut.au8[ 7] = uSrc2.au8[3];
11884 uDstOut.au8[ 8] = uSrc1.au8[4];
11885 uDstOut.au8[ 9] = uSrc2.au8[4];
11886 uDstOut.au8[10] = uSrc1.au8[5];
11887 uDstOut.au8[11] = uSrc2.au8[5];
11888 uDstOut.au8[12] = uSrc1.au8[6];
11889 uDstOut.au8[13] = uSrc2.au8[6];
11890 uDstOut.au8[14] = uSrc1.au8[7];
11891 uDstOut.au8[15] = uSrc2.au8[7];
11892 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11893 uDstOut.au8[16] = uSrc1.au8[16];
11894 uDstOut.au8[17] = uSrc2.au8[16];
11895 uDstOut.au8[18] = uSrc1.au8[17];
11896 uDstOut.au8[19] = uSrc2.au8[17];
11897 uDstOut.au8[20] = uSrc1.au8[18];
11898 uDstOut.au8[21] = uSrc2.au8[18];
11899 uDstOut.au8[22] = uSrc1.au8[19];
11900 uDstOut.au8[23] = uSrc2.au8[19];
11901 uDstOut.au8[24] = uSrc1.au8[20];
11902 uDstOut.au8[25] = uSrc2.au8[20];
11903 uDstOut.au8[26] = uSrc1.au8[21];
11904 uDstOut.au8[27] = uSrc2.au8[21];
11905 uDstOut.au8[28] = uSrc1.au8[22];
11906 uDstOut.au8[29] = uSrc2.au8[22];
11907 uDstOut.au8[30] = uSrc1.au8[23];
11908 uDstOut.au8[31] = uSrc2.au8[23];
11909 *puDst = uDstOut;
11910}
11911
11912
/*
 * PUNPCKLWD - low words -> dwords
 */
11916#ifdef IEM_WITHOUT_ASSEMBLY
11917
11918IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11919{
11920 RTUINT64U const uSrc2 = { *puSrc };
11921 RTUINT64U const uSrc1 = { *puDst };
11922 ASMCompilerBarrier();
11923 RTUINT64U uDstOut;
11924 uDstOut.au16[0] = uSrc1.au16[0];
11925 uDstOut.au16[1] = uSrc2.au16[0];
11926 uDstOut.au16[2] = uSrc1.au16[1];
11927 uDstOut.au16[3] = uSrc2.au16[1];
11928 *puDst = uDstOut.u;
11929}
11930
11931
11932IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11933{
11934 RTUINT128U const uSrc2 = *puSrc;
11935 RTUINT128U const uSrc1 = *puDst;
11936 ASMCompilerBarrier();
11937 RTUINT128U uDstOut;
11938 uDstOut.au16[0] = uSrc1.au16[0];
11939 uDstOut.au16[1] = uSrc2.au16[0];
11940 uDstOut.au16[2] = uSrc1.au16[1];
11941 uDstOut.au16[3] = uSrc2.au16[1];
11942 uDstOut.au16[4] = uSrc1.au16[2];
11943 uDstOut.au16[5] = uSrc2.au16[2];
11944 uDstOut.au16[6] = uSrc1.au16[3];
11945 uDstOut.au16[7] = uSrc2.au16[3];
11946 *puDst = uDstOut;
11947}
11948
11949#endif
11950
11951IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11952{
11953 RTUINT128U const uSrc2 = *puSrc2;
11954 RTUINT128U const uSrc1 = *puSrc1;
11955 ASMCompilerBarrier();
11956 RTUINT128U uDstOut;
11957 uDstOut.au16[0] = uSrc1.au16[0];
11958 uDstOut.au16[1] = uSrc2.au16[0];
11959 uDstOut.au16[2] = uSrc1.au16[1];
11960 uDstOut.au16[3] = uSrc2.au16[1];
11961 uDstOut.au16[4] = uSrc1.au16[2];
11962 uDstOut.au16[5] = uSrc2.au16[2];
11963 uDstOut.au16[6] = uSrc1.au16[3];
11964 uDstOut.au16[7] = uSrc2.au16[3];
11965 *puDst = uDstOut;
11966}
11967
11968
11969IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11970{
11971 RTUINT256U const uSrc2 = *puSrc2;
11972 RTUINT256U const uSrc1 = *puSrc1;
11973 ASMCompilerBarrier();
11974 RTUINT256U uDstOut;
11975 uDstOut.au16[0] = uSrc1.au16[0];
11976 uDstOut.au16[1] = uSrc2.au16[0];
11977 uDstOut.au16[2] = uSrc1.au16[1];
11978 uDstOut.au16[3] = uSrc2.au16[1];
11979 uDstOut.au16[4] = uSrc1.au16[2];
11980 uDstOut.au16[5] = uSrc2.au16[2];
11981 uDstOut.au16[6] = uSrc1.au16[3];
11982 uDstOut.au16[7] = uSrc2.au16[3];
11983
11984 uDstOut.au16[8] = uSrc1.au16[8];
11985 uDstOut.au16[9] = uSrc2.au16[8];
11986 uDstOut.au16[10] = uSrc1.au16[9];
11987 uDstOut.au16[11] = uSrc2.au16[9];
11988 uDstOut.au16[12] = uSrc1.au16[10];
11989 uDstOut.au16[13] = uSrc2.au16[10];
11990 uDstOut.au16[14] = uSrc1.au16[11];
11991 uDstOut.au16[15] = uSrc2.au16[11];
11992 *puDst = uDstOut;
11993}
11994
11995
/*
 * PUNPCKLDQ - low dwords -> qword(s)
 */
11999#ifdef IEM_WITHOUT_ASSEMBLY
12000
12001IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12002{
12003 RTUINT64U const uSrc2 = { *puSrc };
12004 RTUINT64U const uSrc1 = { *puDst };
12005 ASMCompilerBarrier();
12006 RTUINT64U uDstOut;
12007 uDstOut.au32[0] = uSrc1.au32[0];
12008 uDstOut.au32[1] = uSrc2.au32[0];
12009 *puDst = uDstOut.u;
12010}
12011
12012
12013IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12014{
12015 RTUINT128U const uSrc2 = *puSrc;
12016 RTUINT128U const uSrc1 = *puDst;
12017 ASMCompilerBarrier();
12018 RTUINT128U uDstOut;
12019 uDstOut.au32[0] = uSrc1.au32[0];
12020 uDstOut.au32[1] = uSrc2.au32[0];
12021 uDstOut.au32[2] = uSrc1.au32[1];
12022 uDstOut.au32[3] = uSrc2.au32[1];
12023 *puDst = uDstOut;
12024}
12025
12026#endif
12027
12028IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12029{
12030 RTUINT128U const uSrc2 = *puSrc2;
12031 RTUINT128U const uSrc1 = *puSrc1;
12032 ASMCompilerBarrier();
12033 RTUINT128U uDstOut;
12034 uDstOut.au32[0] = uSrc1.au32[0];
12035 uDstOut.au32[1] = uSrc2.au32[0];
12036 uDstOut.au32[2] = uSrc1.au32[1];
12037 uDstOut.au32[3] = uSrc2.au32[1];
12038 *puDst = uDstOut;
12039}
12040
12041
12042IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12043{
12044 RTUINT256U const uSrc2 = *puSrc2;
12045 RTUINT256U const uSrc1 = *puSrc1;
12046 ASMCompilerBarrier();
12047 RTUINT256U uDstOut;
12048 uDstOut.au32[0] = uSrc1.au32[0];
12049 uDstOut.au32[1] = uSrc2.au32[0];
12050 uDstOut.au32[2] = uSrc1.au32[1];
12051 uDstOut.au32[3] = uSrc2.au32[1];
12052
12053 uDstOut.au32[4] = uSrc1.au32[4];
12054 uDstOut.au32[5] = uSrc2.au32[4];
12055 uDstOut.au32[6] = uSrc1.au32[5];
12056 uDstOut.au32[7] = uSrc2.au32[5];
12057 *puDst = uDstOut;
12058}
12059
12060
12061/*
12062 * PUNPCKLQDQ -> Low qwords -> double qword(s).
12063 */
12064#ifdef IEM_WITHOUT_ASSEMBLY
12065IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12066{
12067 RTUINT128U const uSrc2 = *puSrc;
12068 RTUINT128U const uSrc1 = *puDst;
12069 ASMCompilerBarrier();
12070 RTUINT128U uDstOut;
12071 uDstOut.au64[0] = uSrc1.au64[0];
12072 uDstOut.au64[1] = uSrc2.au64[0];
12073 *puDst = uDstOut;
12074}
12075#endif
12076
12077
12078IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12079{
12080 RTUINT128U const uSrc2 = *puSrc2;
12081 RTUINT128U const uSrc1 = *puSrc1;
12082 ASMCompilerBarrier();
12083 RTUINT128U uDstOut;
12084 uDstOut.au64[0] = uSrc1.au64[0];
12085 uDstOut.au64[1] = uSrc2.au64[0];
12086 *puDst = uDstOut;
12087}
12088
12089
12090IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12091{
12092 RTUINT256U const uSrc2 = *puSrc2;
12093 RTUINT256U const uSrc1 = *puSrc1;
12094 ASMCompilerBarrier();
12095 RTUINT256U uDstOut;
12096 uDstOut.au64[0] = uSrc1.au64[0];
12097 uDstOut.au64[1] = uSrc2.au64[0];
12098
12099 uDstOut.au64[2] = uSrc1.au64[2];
12100 uDstOut.au64[3] = uSrc2.au64[2];
12101 *puDst = uDstOut;
12102}
12103
12104
12105/*
12106 * PACKSSWB - signed words -> signed bytes
12107 */
12108
12109#ifdef IEM_WITHOUT_ASSEMBLY
12110
12111IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12112{
12113 RTUINT64U const uSrc2 = { *puSrc };
12114 RTUINT64U const uSrc1 = { *puDst };
12115 ASMCompilerBarrier();
12116 RTUINT64U uDstOut;
12117 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12118 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12119 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12120 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12121 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12122 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12123 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12124 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12125 *puDst = uDstOut.u;
12126}
12127
12128
12129IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12130{
12131 RTUINT128U const uSrc2 = *puSrc;
12132 RTUINT128U const uSrc1 = *puDst;
12133 ASMCompilerBarrier();
12134 RTUINT128U uDstOut;
12135 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12136 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12137 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12138 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12139 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12140 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12141 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12142 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12143 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12144 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12145 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12146 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12147 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12148 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12149 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12150 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12151 *puDst = uDstOut;
12152}
12153
12154#endif
12155
12156IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12157{
12158 RTUINT128U const uSrc2 = *puSrc2;
12159 RTUINT128U const uSrc1 = *puSrc1;
12160 ASMCompilerBarrier();
12161 RTUINT128U uDstOut;
12162 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12163 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12164 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12165 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12166 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12167 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12168 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12169 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12170 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12171 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12172 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12173 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12174 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12175 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12176 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12177 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12178 *puDst = uDstOut;
12179}
12180
12181
12182IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12183{
12184 RTUINT256U const uSrc2 = *puSrc2;
12185 RTUINT256U const uSrc1 = *puSrc1;
12186 ASMCompilerBarrier();
12187 RTUINT256U uDstOut;
12188 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12189 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12190 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12191 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12192 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12193 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12194 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12195 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12196 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12197 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12198 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12199 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12200 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12201 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12202 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12203 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12204
12205 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
12206 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
12207 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
12208 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
12209 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
12210 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
12211 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
12212 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
12213 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
12214 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
12215 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
12216 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
12217 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
12218 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
12219 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
12220 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
12221 *puDst = uDstOut;
12222}
12223
12224
12225/*
12226 * PACKUSWB - signed words -> unsigned bytes
12227 */
/**
 * Narrows a 16-bit value to a byte with unsigned saturation.
 *
 * Values that are at most 0xff when viewed as uint16_t pass through unchanged.
 * Anything larger yields 0x00 when bit 15 (the word's sign bit) is set and
 * 0xff when it is clear.
 *
 * @note The argument is expanded more than once, so it must not have side effects.
 */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    ( (uint16_t)(a_iWord) > (uint16_t)0xff \
      ? (uint8_t)0xff * (uint8_t)(1 ^ (((a_iWord) >> 15) & 1)) \
      : (uint8_t)(a_iWord) )
12232
12233#ifdef IEM_WITHOUT_ASSEMBLY
12234
12235IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12236{
12237 RTUINT64U const uSrc2 = { *puSrc };
12238 RTUINT64U const uSrc1 = { *puDst };
12239 ASMCompilerBarrier();
12240 RTUINT64U uDstOut;
12241 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12242 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12243 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12244 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12245 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12246 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12247 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12248 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12249 *puDst = uDstOut.u;
12250}
12251
12252
12253IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12254{
12255 RTUINT128U const uSrc2 = *puSrc;
12256 RTUINT128U const uSrc1 = *puDst;
12257 ASMCompilerBarrier();
12258 RTUINT128U uDstOut;
12259 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12260 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12261 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12262 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12263 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12264 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12265 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12266 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12267 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12268 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12269 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12270 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12271 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12272 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12273 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12274 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12275 *puDst = uDstOut;
12276}
12277
12278#endif
12279
12280IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12281{
12282 RTUINT128U const uSrc2 = *puSrc2;
12283 RTUINT128U const uSrc1 = *puSrc1;
12284 ASMCompilerBarrier();
12285 RTUINT128U uDstOut;
12286 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12287 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12288 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12289 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12290 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12291 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12292 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12293 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12294 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12295 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12296 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12297 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12298 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12299 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12300 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12301 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12302 *puDst = uDstOut;
12303}
12304
12305
12306IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12307{
12308 RTUINT256U const uSrc2 = *puSrc2;
12309 RTUINT256U const uSrc1 = *puSrc1;
12310 ASMCompilerBarrier();
12311 RTUINT256U uDstOut;
12312 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12313 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12314 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12315 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12316 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12317 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12318 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12319 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12320 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12321 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12322 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12323 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12324 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12325 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12326 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12327 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12328
12329 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
12330 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
12331 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
12332 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
12333 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
12334 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
12335 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
12336 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
12337 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
12338 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
12339 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
12340 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
12341 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
12342 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
12343 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
12344 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
12345 *puDst = uDstOut;
12346}
12347
12348
12349/*
12350 * PACKSSDW - signed dwords -> signed words
12351 */
12352
12353#ifdef IEM_WITHOUT_ASSEMBLY
12354
12355IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12356{
12357 RTUINT64U const uSrc2 = { *puSrc };
12358 RTUINT64U const uSrc1 = { *puDst };
12359 ASMCompilerBarrier();
12360 RTUINT64U uDstOut;
12361 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12362 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12363 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12364 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12365 *puDst = uDstOut.u;
12366}
12367
12368
12369IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12370{
12371 RTUINT128U const uSrc2 = *puSrc;
12372 RTUINT128U const uSrc1 = *puDst;
12373 ASMCompilerBarrier();
12374 RTUINT128U uDstOut;
12375 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12376 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12377 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12378 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12379 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12380 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12381 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12382 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12383 *puDst = uDstOut;
12384}
12385
12386#endif
12387
12388IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12389{
12390 RTUINT128U const uSrc2 = *puSrc2;
12391 RTUINT128U const uSrc1 = *puSrc1;
12392 ASMCompilerBarrier();
12393 RTUINT128U uDstOut;
12394 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12395 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12396 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12397 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12398 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12399 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12400 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12401 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12402 *puDst = uDstOut;
12403}
12404
12405
12406IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12407{
12408 RTUINT256U const uSrc2 = *puSrc2;
12409 RTUINT256U const uSrc1 = *puSrc1;
12410 ASMCompilerBarrier();
12411 RTUINT256U uDstOut;
12412 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12413 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12414 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12415 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12416 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12417 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12418 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12419 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12420
12421 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
12422 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
12423 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
12424 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
12425 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
12426 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
12427 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
12428 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
12429 *puDst = uDstOut;
12430}
12431
12432
12433/*
12434 * PACKUSDW - signed dwords -> unsigned words
12435 */
/**
 * Narrows a 32-bit value to a word with unsigned saturation.
 *
 * Values that are at most 0xffff when viewed as uint32_t pass through
 * unchanged.  Anything larger yields 0x0000 when bit 31 (the dword's sign bit)
 * is set and 0xffff when it is clear.
 *
 * @note The argument is expanded more than once, so it must not have side effects.
 */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    ( (uint32_t)(a_iDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff * (uint16_t)(1 ^ (((a_iDword) >> 31) & 1)) \
      : (uint16_t)(a_iDword) )
12440
12441#ifdef IEM_WITHOUT_ASSEMBLY
12442IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12443{
12444 RTUINT128U const uSrc2 = *puSrc;
12445 RTUINT128U const uSrc1 = *puDst;
12446 ASMCompilerBarrier();
12447 RTUINT128U uDstOut;
12448 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12449 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12450 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12451 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12452 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12453 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12454 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12455 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12456 *puDst = uDstOut;
12457}
12458#endif
12459
12460IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12461{
12462 RTUINT128U const uSrc2 = *puSrc2;
12463 RTUINT128U const uSrc1 = *puSrc1;
12464 ASMCompilerBarrier();
12465 RTUINT128U uDstOut;
12466 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12467 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12468 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12469 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12470 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12471 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12472 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12473 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12474 *puDst = uDstOut;
12475}
12476
12477
12478IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12479{
12480 RTUINT256U const uSrc2 = *puSrc2;
12481 RTUINT256U const uSrc1 = *puSrc1;
12482 ASMCompilerBarrier();
12483 RTUINT256U uDstOut;
12484 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12485 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12486 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12487 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12488 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12489 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12490 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12491 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12492
12493 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
12494 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
12495 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
12496 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
12497 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
12498 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
12499 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
12500 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
12501 *puDst = uDstOut;
12502}
12503
12504
12505/*
12506 * [V]PABSB / [V]PABSW / [V]PABSD
12507 */
12508
12509IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12510{
12511 RTUINT64U const uSrc = { *puSrc };
12512 RTUINT64U uDstOut = { 0 };
12513
12514 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
12515 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
12516 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
12517 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
12518 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
12519 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
12520 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
12521 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
12522 *puDst = uDstOut.u;
12523 RT_NOREF(pFpuState);
12524}
12525
12526
12527IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12528{
12529 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12530 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12531 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12532 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12533 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12534 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12535 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12536 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12537 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12538 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12539 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12540 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12541 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12542 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12543 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12544 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12545 RT_NOREF(pFpuState);
12546}
12547
12548
12549IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12550{
12551 RTUINT64U const uSrc = { *puSrc };
12552 RTUINT64U uDstOut = { 0 };
12553
12554 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
12555 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
12556 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
12557 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
12558 *puDst = uDstOut.u;
12559 RT_NOREF(pFpuState);
12560}
12561
12562
12563IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12564{
12565 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12566 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12567 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12568 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12569 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12570 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12571 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12572 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12573 RT_NOREF(pFpuState);
12574}
12575
12576
12577IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12578{
12579 RTUINT64U const uSrc = { *puSrc };
12580 RTUINT64U uDstOut = { 0 };
12581
12582 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
12583 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
12584 *puDst = uDstOut.u;
12585 RT_NOREF(pFpuState);
12586}
12587
12588
12589IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12590{
12591 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12592 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12593 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12594 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12595 RT_NOREF(pFpuState);
12596}
12597
12598
12599IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12600{
12601 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12602 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12603 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12604 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12605 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12606 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12607 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12608 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12609 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12610 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12611 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12612 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12613 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12614 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12615 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12616 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12617}
12618
12619
12620IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12621{
12622 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12623 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12624 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12625 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12626 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12627 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12628 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12629 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12630 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12631 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12632 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12633 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12634 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12635 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12636 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12637 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12638 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
12639 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
12640 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
12641 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
12642 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
12643 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
12644 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
12645 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
12646 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
12647 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
12648 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
12649 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
12650 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
12651 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
12652 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
12653 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
12654}
12655
12656
12657IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12658{
12659 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12660 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12661 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12662 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12663 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12664 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12665 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12666 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12667}
12668
12669
12670IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12671{
12672 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12673 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12674 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12675 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12676 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12677 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12678 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12679 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12680 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
12681 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
12682 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
12683 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
12684 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
12685 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
12686 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
12687 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
12688}
12689
12690
12691IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12692{
12693 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12694 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12695 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12696 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12697}
12698
12699
12700IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12701{
12702 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12703 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12704 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12705 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12706 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
12707 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
12708 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
12709 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
12710}
12711
12712
12713/*
12714 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
12715 */
12716IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12717{
12718 RTUINT64U uSrc1 = { *puDst };
12719 RTUINT64U uSrc2 = { *puSrc };
12720 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12721
12722 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
12723 {
12724 if (uSrc2.ai8[i] < 0)
12725 uDst.ai8[i] = -uSrc1.ai8[i];
12726 else if (uSrc2.ai8[i] == 0)
12727 uDst.ai8[i] = 0;
12728 else /* uSrc2.ai8[i] > 0 */
12729 uDst.ai8[i] = uSrc1.ai8[i];
12730 }
12731
12732 *puDst = uDst.u;
12733 RT_NOREF(pFpuState);
12734}
12735
12736
12737IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12738{
12739 RTUINT128U uSrc1 = *puDst;
12740
12741 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12742 {
12743 if (puSrc->ai8[i] < 0)
12744 puDst->ai8[i] = -uSrc1.ai8[i];
12745 else if (puSrc->ai8[i] == 0)
12746 puDst->ai8[i] = 0;
12747 else /* puSrc->ai8[i] > 0 */
12748 puDst->ai8[i] = uSrc1.ai8[i];
12749 }
12750
12751 RT_NOREF(pFpuState);
12752}
12753
12754
12755IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12756{
12757 RTUINT64U uSrc1 = { *puDst };
12758 RTUINT64U uSrc2 = { *puSrc };
12759 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12760
12761 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
12762 {
12763 if (uSrc2.ai16[i] < 0)
12764 uDst.ai16[i] = -uSrc1.ai16[i];
12765 else if (uSrc2.ai16[i] == 0)
12766 uDst.ai16[i] = 0;
12767 else /* uSrc2.ai16[i] > 0 */
12768 uDst.ai16[i] = uSrc1.ai16[i];
12769 }
12770
12771 *puDst = uDst.u;
12772 RT_NOREF(pFpuState);
12773}
12774
12775
12776IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12777{
12778 RTUINT128U uSrc1 = *puDst;
12779
12780 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12781 {
12782 if (puSrc->ai16[i] < 0)
12783 puDst->ai16[i] = -uSrc1.ai16[i];
12784 else if (puSrc->ai16[i] == 0)
12785 puDst->ai16[i] = 0;
12786 else /* puSrc->ai16[i] > 0 */
12787 puDst->ai16[i] = uSrc1.ai16[i];
12788 }
12789
12790 RT_NOREF(pFpuState);
12791}
12792
12793
12794IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12795{
12796 RTUINT64U uSrc1 = { *puDst };
12797 RTUINT64U uSrc2 = { *puSrc };
12798 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12799
12800 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
12801 {
12802 if (uSrc2.ai32[i] < 0)
12803 uDst.ai32[i] = -uSrc1.ai32[i];
12804 else if (uSrc2.ai32[i] == 0)
12805 uDst.ai32[i] = 0;
12806 else /* uSrc2.ai32[i] > 0 */
12807 uDst.ai32[i] = uSrc1.ai32[i];
12808 }
12809
12810 *puDst = uDst.u;
12811 RT_NOREF(pFpuState);
12812}
12813
12814
12815IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12816{
12817 RTUINT128U uSrc1 = *puDst;
12818
12819 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12820 {
12821 if (puSrc->ai32[i] < 0)
12822 puDst->ai32[i] = -uSrc1.ai32[i];
12823 else if (puSrc->ai32[i] == 0)
12824 puDst->ai32[i] = 0;
12825 else /* puSrc->ai32[i] > 0 */
12826 puDst->ai32[i] = uSrc1.ai32[i];
12827 }
12828
12829 RT_NOREF(pFpuState);
12830}
12831
12832
12833IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12834{
12835 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12836 {
12837 if (puSrc2->ai8[i] < 0)
12838 puDst->ai8[i] = -puSrc1->ai8[i];
12839 else if (puSrc2->ai8[i] == 0)
12840 puDst->ai8[i] = 0;
12841 else /* puSrc2->ai8[i] > 0 */
12842 puDst->ai8[i] = puSrc1->ai8[i];
12843 }
12844}
12845
12846
12847IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12848{
12849 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12850 {
12851 if (puSrc2->ai8[i] < 0)
12852 puDst->ai8[i] = -puSrc1->ai8[i];
12853 else if (puSrc2->ai8[i] == 0)
12854 puDst->ai8[i] = 0;
12855 else /* puSrc2->ai8[i] > 0 */
12856 puDst->ai8[i] = puSrc1->ai8[i];
12857 }
12858}
12859
12860
12861IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12862{
12863 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12864 {
12865 if (puSrc2->ai16[i] < 0)
12866 puDst->ai16[i] = -puSrc1->ai16[i];
12867 else if (puSrc2->ai16[i] == 0)
12868 puDst->ai16[i] = 0;
12869 else /* puSrc2->ai16[i] > 0 */
12870 puDst->ai16[i] = puSrc1->ai16[i];
12871 }
12872}
12873
12874
12875IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12876{
12877 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12878 {
12879 if (puSrc2->ai16[i] < 0)
12880 puDst->ai16[i] = -puSrc1->ai16[i];
12881 else if (puSrc2->ai16[i] == 0)
12882 puDst->ai16[i] = 0;
12883 else /* puSrc2->ai16[i] > 0 */
12884 puDst->ai16[i] = puSrc1->ai16[i];
12885 }
12886}
12887
12888
12889IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12890{
12891 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12892 {
12893 if (puSrc2->ai32[i] < 0)
12894 puDst->ai32[i] = -puSrc1->ai32[i];
12895 else if (puSrc2->ai32[i] == 0)
12896 puDst->ai32[i] = 0;
12897 else /* puSrc2->ai32[i] > 0 */
12898 puDst->ai32[i] = puSrc1->ai32[i];
12899 }
12900}
12901
12902
12903IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12904{
12905 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12906 {
12907 if (puSrc2->ai32[i] < 0)
12908 puDst->ai32[i] = -puSrc1->ai32[i];
12909 else if (puSrc2->ai32[i] == 0)
12910 puDst->ai32[i] = 0;
12911 else /* puSrc2->ai32[i] > 0 */
12912 puDst->ai32[i] = puSrc1->ai32[i];
12913 }
12914}
12915
12916
12917/*
12918 * PHADDW / VPHADDW / PHADDD / VPHADDD
12919 */
12920IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12921{
12922 RTUINT64U uSrc1 = { *puDst };
12923 RTUINT64U uSrc2 = { *puSrc };
12924 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12925
12926 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
12927 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
12928 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
12929 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
12930 *puDst = uDst.u;
12931 RT_NOREF(pFpuState);
12932}
12933
12934
12935IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12936{
12937 RTUINT128U uSrc1 = *puDst;
12938
12939 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
12940 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
12941 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
12942 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
12943
12944 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
12945 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
12946 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
12947 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
12948 RT_NOREF(pFpuState);
12949}
12950
12951
12952IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12953{
12954 RTUINT64U uSrc1 = { *puDst };
12955 RTUINT64U uSrc2 = { *puSrc };
12956 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12957
12958 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
12959 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
12960 *puDst = uDst.u;
12961 RT_NOREF(pFpuState);
12962}
12963
12964
12965IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12966{
12967 RTUINT128U uSrc1 = *puDst;
12968
12969 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
12970 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
12971
12972 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
12973 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
12974 RT_NOREF(pFpuState);
12975}
12976
12977
12978IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12979{
12980 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
12981
12982 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
12983 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
12984 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
12985 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
12986
12987 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
12988 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
12989 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
12990 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
12991
12992 puDst->au64[0] = uDst.au64[0];
12993 puDst->au64[1] = uDst.au64[1];
12994}
12995
12996
12997IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12998{
12999 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13000
13001 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
13002 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
13003 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
13004 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
13005 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
13006 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
13007 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
13008 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
13009
13010 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
13011 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
13012 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
13013 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
13014 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
13015 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
13016 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
13017 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
13018
13019 puDst->au64[0] = uDst.au64[0];
13020 puDst->au64[1] = uDst.au64[1];
13021 puDst->au64[2] = uDst.au64[2];
13022 puDst->au64[3] = uDst.au64[3];
13023}
13024
13025
13026IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13027{
13028 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13029
13030 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
13031 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
13032
13033 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
13034 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
13035
13036 puDst->au64[0] = uDst.au64[0];
13037 puDst->au64[1] = uDst.au64[1];
13038}
13039
13040
13041IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13042{
13043 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13044
13045 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
13046 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
13047 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
13048 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
13049
13050 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
13051 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
13052 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
13053 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
13054
13055 puDst->au64[0] = uDst.au64[0];
13056 puDst->au64[1] = uDst.au64[1];
13057 puDst->au64[2] = uDst.au64[2];
13058 puDst->au64[3] = uDst.au64[3];
13059}
13060
13061
13062/*
13063 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
13064 */
13065IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13066{
13067 RTUINT64U uSrc1 = { *puDst };
13068 RTUINT64U uSrc2 = { *puSrc };
13069 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13070
13071 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13072 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13073 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
13074 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
13075 *puDst = uDst.u;
13076 RT_NOREF(pFpuState);
13077}
13078
13079
13080IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13081{
13082 RTUINT128U uSrc1 = *puDst;
13083
13084 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13085 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13086 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
13087 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
13088
13089 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
13090 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
13091 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
13092 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
13093 RT_NOREF(pFpuState);
13094}
13095
13096
13097IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13098{
13099 RTUINT64U uSrc1 = { *puDst };
13100 RTUINT64U uSrc2 = { *puSrc };
13101 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13102
13103 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13104 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
13105 *puDst = uDst.u;
13106 RT_NOREF(pFpuState);
13107}
13108
13109
13110IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13111{
13112 RTUINT128U uSrc1 = *puDst;
13113
13114 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13115 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
13116
13117 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
13118 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
13119 RT_NOREF(pFpuState);
13120}
13121
13122
13123IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13124{
13125 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13126
13127 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
13128 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
13129 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
13130 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
13131
13132 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
13133 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
13134 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
13135 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
13136
13137 puDst->au64[0] = uDst.au64[0];
13138 puDst->au64[1] = uDst.au64[1];
13139}
13140
13141
13142IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13143{
13144 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13145
13146 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
13147 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
13148 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
13149 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
13150 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
13151 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
13152 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
13153 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
13154
13155 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
13156 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
13157 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
13158 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
13159 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
13160 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
13161 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
13162 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
13163
13164 puDst->au64[0] = uDst.au64[0];
13165 puDst->au64[1] = uDst.au64[1];
13166 puDst->au64[2] = uDst.au64[2];
13167 puDst->au64[3] = uDst.au64[3];
13168}
13169
13170
13171IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13172{
13173 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13174
13175 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
13176 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
13177
13178 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
13179 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
13180
13181 puDst->au64[0] = uDst.au64[0];
13182 puDst->au64[1] = uDst.au64[1];
13183}
13184
13185
13186IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13187{
13188 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13189
13190 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
13191 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
13192 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
13193 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
13194
13195 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
13196 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
13197 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
13198 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
13199
13200 puDst->au64[0] = uDst.au64[0];
13201 puDst->au64[1] = uDst.au64[1];
13202 puDst->au64[2] = uDst.au64[2];
13203 puDst->au64[3] = uDst.au64[3];
13204}
13205
13206
13207/*
13208 * PHADDSW / VPHADDSW
13209 */
13210IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13211{
13212 RTUINT64U uSrc1 = { *puDst };
13213 RTUINT64U uSrc2 = { *puSrc };
13214 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13215
13216 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13217 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13218 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
13219 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
13220 *puDst = uDst.u;
13221 RT_NOREF(pFpuState);
13222}
13223
13224
13225IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13226{
13227 RTUINT128U uSrc1 = *puDst;
13228
13229 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13230 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13231 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
13232 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
13233
13234 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
13235 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
13236 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
13237 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
13238 RT_NOREF(pFpuState);
13239}
13240
13241
13242IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13243{
13244 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13245
13246 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
13247 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
13248 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
13249 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
13250
13251 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
13252 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
13253 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
13254 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
13255
13256 puDst->au64[0] = uDst.au64[0];
13257 puDst->au64[1] = uDst.au64[1];
13258}
13259
13260
13261IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13262{
13263 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13264
13265 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
13266 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
13267 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
13268 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
13269 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
13270 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
13271 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
13272 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
13273
13274 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
13275 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
13276 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
13277 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
13278 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
13279 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
13280 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
13281 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
13282
13283 puDst->au64[0] = uDst.au64[0];
13284 puDst->au64[1] = uDst.au64[1];
13285 puDst->au64[2] = uDst.au64[2];
13286 puDst->au64[3] = uDst.au64[3];
13287}
13288
13289
13290/*
13291 * PHSUBSW / VPHSUBSW
13292 */
13293IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13294{
13295 RTUINT64U uSrc1 = { *puDst };
13296 RTUINT64U uSrc2 = { *puSrc };
13297 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13298
13299 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13300 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13301 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
13302 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
13303 *puDst = uDst.u;
13304 RT_NOREF(pFpuState);
13305}
13306
13307
13308IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13309{
13310 RTUINT128U uSrc1 = *puDst;
13311
13312 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13313 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13314 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
13315 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
13316
13317 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
13318 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
13319 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
13320 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
13321 RT_NOREF(pFpuState);
13322}
13323
13324
13325IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13326{
13327 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13328
13329 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
13330 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
13331 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
13332 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
13333
13334 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
13335 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
13336 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
13337 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
13338
13339 puDst->au64[0] = uDst.au64[0];
13340 puDst->au64[1] = uDst.au64[1];
13341}
13342
13343
13344IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13345{
13346 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13347
13348 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
13349 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
13350 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
13351 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
13352 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
13353 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
13354 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
13355 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
13356
13357 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
13358 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
13359 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
13360 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
13361 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
13362 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
13363 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
13364 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
13365
13366 puDst->au64[0] = uDst.au64[0];
13367 puDst->au64[1] = uDst.au64[1];
13368 puDst->au64[2] = uDst.au64[2];
13369 puDst->au64[3] = uDst.au64[3];
13370}
13371
13372
13373/*
13374 * PMADDUBSW / VPMADDUBSW
13375 */
13376IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13377{
13378 RTUINT64U uSrc1 = { *puDst };
13379 RTUINT64U uSrc2 = { *puSrc };
13380 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13381
13382 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
13383 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
13384 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
13385 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
13386 *puDst = uDst.u;
13387 RT_NOREF(pFpuState);
13388}
13389
13390
13391IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13392{
13393 RTUINT128U uSrc1 = *puDst;
13394
13395 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
13396 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
13397 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
13398 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
13399 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
13400 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
13401 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
13402 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
13403 RT_NOREF(pFpuState);
13404}
13405
13406
13407IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13408{
13409 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13410
13411 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13412 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13413 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13414 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13415 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13416 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13417 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13418 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13419
13420 puDst->au64[0] = uDst.au64[0];
13421 puDst->au64[1] = uDst.au64[1];
13422}
13423
13424
13425IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13426{
13427 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13428
13429 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13430 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13431 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13432 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13433 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13434 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13435 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13436 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13437 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
13438 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
13439 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
13440 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
13441 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
13442 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
13443 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
13444 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
13445
13446 puDst->au64[0] = uDst.au64[0];
13447 puDst->au64[1] = uDst.au64[1];
13448 puDst->au64[2] = uDst.au64[2];
13449 puDst->au64[3] = uDst.au64[3];
13450}
13451
13452
13453/*
13454 * PMULHRSW / VPMULHRSW
13455 */
/* Multiplies the two operands as int32_t, shifts the product right by 14,
   adds one, shifts right by one more and truncates the result to uint16_t. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    (uint16_t)(((((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1) >> 1)
13458
13459IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13460{
13461 RTUINT64U uSrc1 = { *puDst };
13462 RTUINT64U uSrc2 = { *puSrc };
13463 RTUINT64U uDst;
13464
13465 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
13466 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
13467 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
13468 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
13469 *puDst = uDst.u;
13470 RT_NOREF(pFpuState);
13471}
13472
13473
13474IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13475{
13476 RTUINT128U uSrc1 = *puDst;
13477
13478 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
13479 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
13480 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
13481 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
13482 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
13483 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
13484 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
13485 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
13486 RT_NOREF(pFpuState);
13487}
13488
13489
13490IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13491{
13492 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13493
13494 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
13495 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
13496 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
13497 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
13498 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
13499 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
13500 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
13501 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
13502
13503 puDst->au64[0] = uDst.au64[0];
13504 puDst->au64[1] = uDst.au64[1];
13505}
13506
13507
13508IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13509{
13510 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13511
13512 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
13513 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
13514 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
13515 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
13516 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
13517 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
13518 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
13519 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
13520 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
13521 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
13522 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
13523 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
13524 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
13525 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
13526 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
13527 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
13528
13529 puDst->au64[0] = uDst.au64[0];
13530 puDst->au64[1] = uDst.au64[1];
13531 puDst->au64[2] = uDst.au64[2];
13532 puDst->au64[3] = uDst.au64[3];
13533}
13534
13535
13536/*
13537 * PSADBW / VPSADBW
13538 */
13539#ifdef IEM_WITHOUT_ASSEMBLY
13540
13541IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13542{
13543 RTUINT64U uSrc1 = { *puDst };
13544 RTUINT64U uSrc2 = { *puSrc };
13545 RTUINT64U uDst;
13546 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13547 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13548 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13549 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13550 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13551 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13552 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13553 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13554
13555 uDst.au64[0] = 0;
13556 uDst.au16[0] = uSum;
13557 *puDst = uDst.u;
13558}
13559
13560
13561IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13562{
13563 RTUINT128U uSrc1 = *puDst;
13564
13565 puDst->au64[0] = 0;
13566 puDst->au64[1] = 0;
13567
13568 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
13569 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
13570 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
13571 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
13572 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
13573 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
13574 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
13575 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
13576 puDst->au16[0] = uSum;
13577
13578 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
13579 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
13580 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
13581 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
13582 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
13583 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
13584 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
13585 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
13586 puDst->au16[4] = uSum;
13587}
13588
13589#endif
13590
13591IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13592{
13593 RTUINT128U uSrc1 = *puSrc1;
13594 RTUINT128U uSrc2 = *puSrc2;
13595
13596 puDst->au64[0] = 0;
13597 puDst->au64[1] = 0;
13598
13599 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
13600 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13601 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13602 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13603 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13604 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13605 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13606 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13607 puDst->au16[0] = uSum;
13608
13609 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13610 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13611 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13612 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13613 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13614 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13615 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13616 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13617 puDst->au16[4] = uSum;
13618}
13619
13620IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13621{
13622 RTUINT256U uSrc1 = *puSrc1;
13623 RTUINT256U uSrc2 = *puSrc2;
13624
13625 puDst->au64[0] = 0;
13626 puDst->au64[1] = 0;
13627 puDst->au64[2] = 0;
13628 puDst->au64[3] = 0;
13629
13630 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13631 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13632 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13633 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13634 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13635 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13636 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13637 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13638 puDst->au16[0] = uSum;
13639
13640 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13641 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13642 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13643 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13644 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13645 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13646 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13647 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13648 puDst->au16[4] = uSum;
13649
13650 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
13651 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
13652 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
13653 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
13654 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
13655 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
13656 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
13657 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
13658 puDst->au16[8] = uSum;
13659
13660 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
13661 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
13662 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
13663 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
13664 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
13665 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
13666 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
13667 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
13668 puDst->au16[12] = uSum;
13669}
13670
13671
13672/*
13673 * PMULDQ / VPMULDQ
13674 */
13675IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13676{
13677 RTUINT128U uSrc1 = *puDst;
13678
13679 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
13680 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
13681}
13682
13683IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13684{
13685 RTUINT128U uSrc1 = *puSrc1;
13686 RTUINT128U uSrc2 = *puSrc2;
13687
13688 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13689 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13690}
13691
13692IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13693{
13694 RTUINT256U uSrc1 = *puSrc1;
13695 RTUINT256U uSrc2 = *puSrc2;
13696
13697 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13698 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13699 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
13700 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
13701}
13702
13703
13704/*
13705 * PMULUDQ / VPMULUDQ
13706 */
13707#ifdef IEM_WITHOUT_ASSEMBLY
13708
13709IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13710{
13711 RTUINT64U uSrc1 = { *puDst };
13712 RTUINT64U uSrc2 = { *puSrc };
13713 ASMCompilerBarrier();
13714 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13715 RT_NOREF(pFpuState);
13716}
13717
13718
13719IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13720{
13721 RTUINT128U uSrc1 = *puDst;
13722 RTUINT128U uSrc2 = *puSrc;
13723 ASMCompilerBarrier();
13724 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13725 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13726 RT_NOREF(pFpuState);
13727}
13728
13729#endif
13730
13731IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13732{
13733 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13734 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13735 ASMCompilerBarrier();
13736 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13737 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13738}
13739
13740
13741IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13742{
13743 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13744 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13745 ASMCompilerBarrier();
13746 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13747 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13748 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
13749 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
13750}
13751
13752
13753/*
13754 * UNPCKLPS / VUNPCKLPS
13755 */
13756#ifdef IEM_WITHOUT_ASSEMBLY
13757IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13758{
13759 RTUINT128U uSrc1 = *puDst;
13760 RTUINT128U uSrc2 = *puSrc;
13761 ASMCompilerBarrier();
13762 puDst->au32[0] = uSrc1.au32[0];
13763 puDst->au32[1] = uSrc2.au32[0];
13764 puDst->au32[2] = uSrc1.au32[1];
13765 puDst->au32[3] = uSrc2.au32[1];
13766}
13767
13768#endif
13769
13770IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13771{
13772 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13773 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13774 ASMCompilerBarrier();
13775 puDst->au32[0] = uSrc1.au32[0];
13776 puDst->au32[1] = uSrc2.au32[0];
13777 puDst->au32[2] = uSrc1.au32[1];
13778 puDst->au32[3] = uSrc2.au32[1];
13779}
13780
13781
13782IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13783{
13784 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13785 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13786 ASMCompilerBarrier();
13787 puDst->au32[0] = uSrc1.au32[0];
13788 puDst->au32[1] = uSrc2.au32[0];
13789 puDst->au32[2] = uSrc1.au32[1];
13790 puDst->au32[3] = uSrc2.au32[1];
13791
13792 puDst->au32[4] = uSrc1.au32[4];
13793 puDst->au32[5] = uSrc2.au32[4];
13794 puDst->au32[6] = uSrc1.au32[5];
13795 puDst->au32[7] = uSrc2.au32[5];
13796}
13797
13798
13799/*
13800 * UNPCKLPD / VUNPCKLPD
13801 */
13802#ifdef IEM_WITHOUT_ASSEMBLY
13803IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13804{
13805 RTUINT128U uSrc1 = *puDst;
13806 RTUINT128U uSrc2 = *puSrc;
13807 ASMCompilerBarrier();
13808 puDst->au64[0] = uSrc1.au64[0];
13809 puDst->au64[1] = uSrc2.au64[0];
13810}
13811
13812#endif
13813
13814IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13815{
13816 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13817 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13818 ASMCompilerBarrier();
13819 puDst->au64[0] = uSrc1.au64[0];
13820 puDst->au64[1] = uSrc2.au64[0];
13821}
13822
13823
13824IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13825{
13826 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13827 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13828 ASMCompilerBarrier();
13829 puDst->au64[0] = uSrc1.au64[0];
13830 puDst->au64[1] = uSrc2.au64[0];
13831 puDst->au64[2] = uSrc1.au64[2];
13832 puDst->au64[3] = uSrc2.au64[2];
13833}
13834
13835
13836/*
13837 * UNPCKHPS / VUNPCKHPS
13838 */
13839#ifdef IEM_WITHOUT_ASSEMBLY
13840IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13841{
13842 RTUINT128U uSrc1 = *puDst;
13843 RTUINT128U uSrc2 = *puSrc;
13844 ASMCompilerBarrier();
13845 puDst->au32[0] = uSrc1.au32[2];
13846 puDst->au32[1] = uSrc2.au32[2];
13847 puDst->au32[2] = uSrc1.au32[3];
13848 puDst->au32[3] = uSrc2.au32[3];
13849}
13850
13851#endif
13852
13853IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13854{
13855 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13856 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13857 ASMCompilerBarrier();
13858 puDst->au32[0] = uSrc1.au32[2];
13859 puDst->au32[1] = uSrc2.au32[2];
13860 puDst->au32[2] = uSrc1.au32[3];
13861 puDst->au32[3] = uSrc2.au32[3];
13862}
13863
13864
13865IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13866{
13867 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13868 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13869 ASMCompilerBarrier();
13870 puDst->au32[0] = uSrc1.au32[2];
13871 puDst->au32[1] = uSrc2.au32[2];
13872 puDst->au32[2] = uSrc1.au32[3];
13873 puDst->au32[3] = uSrc2.au32[3];
13874
13875 puDst->au32[4] = uSrc1.au32[6];
13876 puDst->au32[5] = uSrc2.au32[6];
13877 puDst->au32[6] = uSrc1.au32[7];
13878 puDst->au32[7] = uSrc2.au32[7];
13879}
13880
13881
13882/*
13883 * UNPCKHPD / VUNPCKHPD
13884 */
13885#ifdef IEM_WITHOUT_ASSEMBLY
13886IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13887{
13888 RTUINT128U uSrc1 = *puDst;
13889 RTUINT128U uSrc2 = *puSrc;
13890 ASMCompilerBarrier();
13891 puDst->au64[0] = uSrc1.au64[1];
13892 puDst->au64[1] = uSrc2.au64[1];
13893}
13894
13895#endif
13896
13897IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13898{
13899 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13900 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13901 ASMCompilerBarrier();
13902 puDst->au64[0] = uSrc1.au64[1];
13903 puDst->au64[1] = uSrc2.au64[1];
13904}
13905
13906
13907IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13908{
13909 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13910 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13911 ASMCompilerBarrier();
13912 puDst->au64[0] = uSrc1.au64[1];
13913 puDst->au64[1] = uSrc2.au64[1];
13914 puDst->au64[2] = uSrc1.au64[3];
13915 puDst->au64[3] = uSrc2.au64[3];
13916}
13917
13918
/*
 * CRC32 (SSE 4.2).
 */
13922
13923IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
13924{
13925 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13926}
13927
13928
13929IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
13930{
13931 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13932}
13933
13934IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
13935{
13936 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13937}
13938
13939IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
13940{
13941 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
13942}
13943
13944
13945/*
13946 * PTEST (SSE 4.1) - special as it output only EFLAGS.
13947 */
13948#ifdef IEM_WITHOUT_ASSEMBLY
13949IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
13950{
13951 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
13952 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
13953 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
13954 fEfl |= X86_EFL_ZF;
13955 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
13956 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
13957 fEfl |= X86_EFL_CF;
13958 *pfEFlags = fEfl;
13959}
13960#endif
13961
13962IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
13963{
13964 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
13965 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
13966 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
13967 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
13968 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
13969 fEfl |= X86_EFL_ZF;
13970 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
13971 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
13972 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
13973 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
13974 fEfl |= X86_EFL_CF;
13975 *pfEFlags = fEfl;
13976}
13977
13978
13979/*
13980 * PMOVSXBW / VPMOVSXBW
13981 */
13982IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
13983{
13984 RTUINT64U uSrc1 = { uSrc };
13985 puDst->ai16[0] = uSrc1.ai8[0];
13986 puDst->ai16[1] = uSrc1.ai8[1];
13987 puDst->ai16[2] = uSrc1.ai8[2];
13988 puDst->ai16[3] = uSrc1.ai8[3];
13989 puDst->ai16[4] = uSrc1.ai8[4];
13990 puDst->ai16[5] = uSrc1.ai8[5];
13991 puDst->ai16[6] = uSrc1.ai8[6];
13992 puDst->ai16[7] = uSrc1.ai8[7];
13993}
13994
13995
13996IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
13997{
13998 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
13999 puDst->ai16[ 0] = uSrc1.ai8[ 0];
14000 puDst->ai16[ 1] = uSrc1.ai8[ 1];
14001 puDst->ai16[ 2] = uSrc1.ai8[ 2];
14002 puDst->ai16[ 3] = uSrc1.ai8[ 3];
14003 puDst->ai16[ 4] = uSrc1.ai8[ 4];
14004 puDst->ai16[ 5] = uSrc1.ai8[ 5];
14005 puDst->ai16[ 6] = uSrc1.ai8[ 6];
14006 puDst->ai16[ 7] = uSrc1.ai8[ 7];
14007 puDst->ai16[ 8] = uSrc1.ai8[ 8];
14008 puDst->ai16[ 9] = uSrc1.ai8[ 9];
14009 puDst->ai16[10] = uSrc1.ai8[10];
14010 puDst->ai16[11] = uSrc1.ai8[11];
14011 puDst->ai16[12] = uSrc1.ai8[12];
14012 puDst->ai16[13] = uSrc1.ai8[13];
14013 puDst->ai16[14] = uSrc1.ai8[14];
14014 puDst->ai16[15] = uSrc1.ai8[15];
14015}
14016
14017
14018/*
14019 * PMOVSXBD / VPMOVSXBD
14020 */
14021IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14022{
14023 RTUINT32U uSrc1 = { uSrc };
14024 puDst->ai32[0] = uSrc1.ai8[0];
14025 puDst->ai32[1] = uSrc1.ai8[1];
14026 puDst->ai32[2] = uSrc1.ai8[2];
14027 puDst->ai32[3] = uSrc1.ai8[3];
14028}
14029
14030
14031IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14032{
14033 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14034 puDst->ai32[0] = uSrc1.ai8[0];
14035 puDst->ai32[1] = uSrc1.ai8[1];
14036 puDst->ai32[2] = uSrc1.ai8[2];
14037 puDst->ai32[3] = uSrc1.ai8[3];
14038 puDst->ai32[4] = uSrc1.ai8[4];
14039 puDst->ai32[5] = uSrc1.ai8[5];
14040 puDst->ai32[6] = uSrc1.ai8[6];
14041 puDst->ai32[7] = uSrc1.ai8[7];
14042}
14043
14044
14045/*
14046 * PMOVSXBQ / VPMOVSXBQ
14047 */
14048IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14049{
14050 RTUINT16U uSrc1 = { uSrc };
14051 puDst->ai64[0] = uSrc1.ai8[0];
14052 puDst->ai64[1] = uSrc1.ai8[1];
14053}
14054
14055
14056IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14057{
14058 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14059 puDst->ai64[0] = uSrc1.ai8[0];
14060 puDst->ai64[1] = uSrc1.ai8[1];
14061 puDst->ai64[2] = uSrc1.ai8[2];
14062 puDst->ai64[3] = uSrc1.ai8[3];
14063}
14064
14065
14066/*
14067 * PMOVSXWD / VPMOVSXWD
14068 */
14069IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14070{
14071 RTUINT64U uSrc1 = { uSrc };
14072 puDst->ai32[0] = uSrc1.ai16[0];
14073 puDst->ai32[1] = uSrc1.ai16[1];
14074 puDst->ai32[2] = uSrc1.ai16[2];
14075 puDst->ai32[3] = uSrc1.ai16[3];
14076}
14077
14078
14079IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14080{
14081 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14082 puDst->ai32[0] = uSrc1.ai16[0];
14083 puDst->ai32[1] = uSrc1.ai16[1];
14084 puDst->ai32[2] = uSrc1.ai16[2];
14085 puDst->ai32[3] = uSrc1.ai16[3];
14086 puDst->ai32[4] = uSrc1.ai16[4];
14087 puDst->ai32[5] = uSrc1.ai16[5];
14088 puDst->ai32[6] = uSrc1.ai16[6];
14089 puDst->ai32[7] = uSrc1.ai16[7];
14090}
14091
14092
14093/*
14094 * PMOVSXWQ / VPMOVSXWQ
14095 */
14096IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14097{
14098 RTUINT32U uSrc1 = { uSrc };
14099 puDst->ai64[0] = uSrc1.ai16[0];
14100 puDst->ai64[1] = uSrc1.ai16[1];
14101}
14102
14103
14104IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14105{
14106 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14107 puDst->ai64[0] = uSrc1.ai16[0];
14108 puDst->ai64[1] = uSrc1.ai16[1];
14109 puDst->ai64[2] = uSrc1.ai16[2];
14110 puDst->ai64[3] = uSrc1.ai16[3];
14111}
14112
14113
14114/*
14115 * PMOVSXDQ / VPMOVSXDQ
14116 */
14117IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14118{
14119 RTUINT64U uSrc1 = { uSrc };
14120 puDst->ai64[0] = uSrc1.ai32[0];
14121 puDst->ai64[1] = uSrc1.ai32[1];
14122}
14123
14124
14125IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14126{
14127 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14128 puDst->ai64[0] = uSrc1.ai32[0];
14129 puDst->ai64[1] = uSrc1.ai32[1];
14130 puDst->ai64[2] = uSrc1.ai32[2];
14131 puDst->ai64[3] = uSrc1.ai32[3];
14132}
14133
14134
14135/*
14136 * PMOVZXBW / VPMOVZXBW
14137 */
14138IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14139{
14140 RTUINT64U uSrc1 = { uSrc };
14141 puDst->au16[0] = uSrc1.au8[0];
14142 puDst->au16[1] = uSrc1.au8[1];
14143 puDst->au16[2] = uSrc1.au8[2];
14144 puDst->au16[3] = uSrc1.au8[3];
14145 puDst->au16[4] = uSrc1.au8[4];
14146 puDst->au16[5] = uSrc1.au8[5];
14147 puDst->au16[6] = uSrc1.au8[6];
14148 puDst->au16[7] = uSrc1.au8[7];
14149}
14150
14151
14152IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14153{
14154 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14155 puDst->au16[ 0] = uSrc1.au8[ 0];
14156 puDst->au16[ 1] = uSrc1.au8[ 1];
14157 puDst->au16[ 2] = uSrc1.au8[ 2];
14158 puDst->au16[ 3] = uSrc1.au8[ 3];
14159 puDst->au16[ 4] = uSrc1.au8[ 4];
14160 puDst->au16[ 5] = uSrc1.au8[ 5];
14161 puDst->au16[ 6] = uSrc1.au8[ 6];
14162 puDst->au16[ 7] = uSrc1.au8[ 7];
14163 puDst->au16[ 8] = uSrc1.au8[ 8];
14164 puDst->au16[ 9] = uSrc1.au8[ 9];
14165 puDst->au16[10] = uSrc1.au8[10];
14166 puDst->au16[11] = uSrc1.au8[11];
14167 puDst->au16[12] = uSrc1.au8[12];
14168 puDst->au16[13] = uSrc1.au8[13];
14169 puDst->au16[14] = uSrc1.au8[14];
14170 puDst->au16[15] = uSrc1.au8[15];
14171}
14172
14173
14174/*
14175 * PMOVZXBD / VPMOVZXBD
14176 */
14177IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14178{
14179 RTUINT32U uSrc1 = { uSrc };
14180 puDst->au32[0] = uSrc1.au8[0];
14181 puDst->au32[1] = uSrc1.au8[1];
14182 puDst->au32[2] = uSrc1.au8[2];
14183 puDst->au32[3] = uSrc1.au8[3];
14184}
14185
14186
14187IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14188{
14189 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14190 puDst->au32[0] = uSrc1.au8[0];
14191 puDst->au32[1] = uSrc1.au8[1];
14192 puDst->au32[2] = uSrc1.au8[2];
14193 puDst->au32[3] = uSrc1.au8[3];
14194 puDst->au32[4] = uSrc1.au8[4];
14195 puDst->au32[5] = uSrc1.au8[5];
14196 puDst->au32[6] = uSrc1.au8[6];
14197 puDst->au32[7] = uSrc1.au8[7];
14198}
14199
14200
14201/*
14202 * PMOVZXBQ / VPMOVZXBQ
14203 */
14204IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14205{
14206 RTUINT16U uSrc1 = { uSrc };
14207 puDst->au64[0] = uSrc1.au8[0];
14208 puDst->au64[1] = uSrc1.au8[1];
14209}
14210
14211
14212IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14213{
14214 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14215 puDst->au64[0] = uSrc1.au8[0];
14216 puDst->au64[1] = uSrc1.au8[1];
14217 puDst->au64[2] = uSrc1.au8[2];
14218 puDst->au64[3] = uSrc1.au8[3];
14219}
14220
14221
14222/*
14223 * PMOVZXWD / VPMOVZXWD
14224 */
14225IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14226{
14227 RTUINT64U uSrc1 = { uSrc };
14228 puDst->au32[0] = uSrc1.au16[0];
14229 puDst->au32[1] = uSrc1.au16[1];
14230 puDst->au32[2] = uSrc1.au16[2];
14231 puDst->au32[3] = uSrc1.au16[3];
14232}
14233
14234
14235IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14236{
14237 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14238 puDst->au32[0] = uSrc1.au16[0];
14239 puDst->au32[1] = uSrc1.au16[1];
14240 puDst->au32[2] = uSrc1.au16[2];
14241 puDst->au32[3] = uSrc1.au16[3];
14242 puDst->au32[4] = uSrc1.au16[4];
14243 puDst->au32[5] = uSrc1.au16[5];
14244 puDst->au32[6] = uSrc1.au16[6];
14245 puDst->au32[7] = uSrc1.au16[7];
14246}
14247
14248
14249/*
14250 * PMOVZXWQ / VPMOVZXWQ
14251 */
14252IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14253{
14254 RTUINT32U uSrc1 = { uSrc };
14255 puDst->au64[0] = uSrc1.au16[0];
14256 puDst->au64[1] = uSrc1.au16[1];
14257}
14258
14259
14260IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14261{
14262 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14263 puDst->au64[0] = uSrc1.au16[0];
14264 puDst->au64[1] = uSrc1.au16[1];
14265 puDst->au64[2] = uSrc1.au16[2];
14266 puDst->au64[3] = uSrc1.au16[3];
14267}
14268
14269
14270/*
14271 * PMOVZXDQ / VPMOVZXDQ
14272 */
14273IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14274{
14275 RTUINT64U uSrc1 = { uSrc };
14276 puDst->au64[0] = uSrc1.au32[0];
14277 puDst->au64[1] = uSrc1.au32[1];
14278}
14279
14280
14281IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14282{
14283 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14284 puDst->au64[0] = uSrc1.au32[0];
14285 puDst->au64[1] = uSrc1.au32[1];
14286 puDst->au64[2] = uSrc1.au32[2];
14287 puDst->au64[3] = uSrc1.au32[3];
14288}
14289
14290
14291#ifdef IEM_WITHOUT_ASSEMBLY
14292/**
14293 * Converts from the packed IPRT 32-bit (single precision) floating point format to
14294 * the SoftFloat 32-bit floating point format (float32_t).
14295 *
14296 * This is only a structure format conversion, nothing else.
14297 */
14298DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
14299{
14300 float32_t Tmp;
14301 Tmp.v = pr32Val->u;
14302 return Tmp;
14303}
14304
14305
14306/**
14307 * Converts from SoftFloat 32-bit floating point format (float32_t)
14308 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
14309 *
14310 * This is only a structure format conversion, nothing else.
14311 */
14312DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
14313{
14314 pr32Dst->u = r32XSrc.v;
14315 return pr32Dst;
14316}
14317
14318
14319/**
14320 * Converts from the packed IPRT 64-bit (single precision) floating point format to
14321 * the SoftFloat 64-bit floating point format (float64_t).
14322 *
14323 * This is only a structure format conversion, nothing else.
14324 */
14325DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
14326{
14327 float64_t Tmp;
14328 Tmp.v = pr64Val->u;
14329 return Tmp;
14330}
14331
14332
14333/**
14334 * Converts from SoftFloat 64-bit floating point format (float64_t)
14335 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
14336 *
14337 * This is only a structure format conversion, nothing else.
14338 */
14339DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
14340{
14341 pr64Dst->u = r64XSrc.v;
14342 return pr64Dst;
14343}
14344
14345
/** Maps the MXCSR rounding control field onto the corresponding SoftFloat
 * rounding mode value (anything but nearest/up/down yields round-to-minimum-magnitude). */
# define IEM_SOFTFLOAT_ROUNDING_MODE_FROM_MXCSR(a_Mxcsr) \
    (  ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
     : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? (uint8_t)softfloat_round_max \
     : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? (uint8_t)softfloat_round_min \
     :                                                           (uint8_t)softfloat_round_minMag )

/** Initializer for the SoftFloat state structure, derived from an MXCSR value.
 * The rounding mode and the exception mask are taken from @a a_Mxcsr, the
 * remaining members are constants. */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        softfloat_tininess_afterRounding, \
        IEM_SOFTFLOAT_ROUNDING_MODE_FROM_MXCSR(a_Mxcsr), \
        0, \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), /* Matches X86_FSW_?E */ \
        32 /* Rounding precision, not relevant for SIMD. */ \
    }
14358
14359
14360/**
14361 * Helper for transfering exception to MXCSR and setting the result value
14362 * accordingly.
14363 *
14364 * @returns Updated MXCSR.
14365 * @param pSoftState The SoftFloat state following the operation.
14366 * @param r32Result The result of the SoftFloat operation.
14367 * @param pr32Result Where to store the result for IEM.
14368 * @param fMxcsr The original MXCSR value.
14369 */
14370DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
14371 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14372{
14373 iemFpSoftF32ToIprt(pr32Result, r32Result);
14374
14375 uint8_t fXcpt = pSoftState->exceptionFlags;
14376 if ( (fMxcsr & X86_MXCSR_FZ)
14377 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
14378 {
14379 /* Underflow masked and flush to zero is set. */
14380 pr32Result->s.uFraction = 0;
14381 pr32Result->s.uExponent = 0;
14382 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14383 }
14384
14385 /* If DAZ is set \#DE is never set. */
14386 if ( fMxcsr & X86_MXCSR_DAZ
14387 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14388 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14389 fXcpt &= ~X86_MXCSR_DE;
14390
14391 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14392}
14393
14394
14395/**
14396 * Helper for transfering exception to MXCSR and setting the result value
14397 * accordingly - ignores Flush-to-Zero.
14398 *
14399 * @returns Updated MXCSR.
14400 * @param pSoftState The SoftFloat state following the operation.
14401 * @param r32Result The result of the SoftFloat operation.
14402 * @param pr32Result Where to store the result for IEM.
14403 * @param fMxcsr The original MXCSR value.
14404 */
14405DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
14406 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14407{
14408 iemFpSoftF32ToIprt(pr32Result, r32Result);
14409
14410 uint8_t fXcpt = pSoftState->exceptionFlags;
14411 /* If DAZ is set \#DE is never set. */
14412 if ( fMxcsr & X86_MXCSR_DAZ
14413 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14414 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14415 fXcpt &= ~X86_MXCSR_DE;
14416
14417 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14418}
14419
14420
14421/**
14422 * Helper for transfering exception to MXCSR and setting the result value
14423 * accordingly.
14424 *
14425 * @returns Updated MXCSR.
14426 * @param pSoftState The SoftFloat state following the operation.
14427 * @param r64Result The result of the SoftFloat operation.
14428 * @param pr64Result Where to store the result for IEM.
14429 * @param fMxcsr The original MXCSR value.
14430 */
14431DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
14432 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14433{
14434 iemFpSoftF64ToIprt(pr64Result, r64Result);
14435 uint8_t fXcpt = pSoftState->exceptionFlags;
14436 if ( (fMxcsr & X86_MXCSR_FZ)
14437 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
14438 {
14439 /* Underflow masked and flush to zero is set. */
14440 iemFpSoftF64ToIprt(pr64Result, r64Result);
14441 pr64Result->s.uFractionHigh = 0;
14442 pr64Result->s.uFractionLow = 0;
14443 pr64Result->s.uExponent = 0;
14444 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14445 }
14446
14447 /* If DAZ is set \#DE is never set. */
14448 if ( fMxcsr & X86_MXCSR_DAZ
14449 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14450 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14451 fXcpt &= ~X86_MXCSR_DE;
14452
14453 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14454}
14455
14456
14457/**
14458 * Helper for transfering exception to MXCSR and setting the result value
14459 * accordingly - ignores Flush-to-Zero.
14460 *
14461 * @returns Updated MXCSR.
14462 * @param pSoftState The SoftFloat state following the operation.
14463 * @param r64Result The result of the SoftFloat operation.
14464 * @param pr64Result Where to store the result for IEM.
14465 * @param fMxcsr The original MXCSR value.
14466 */
14467DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
14468 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14469{
14470 iemFpSoftF64ToIprt(pr64Result, r64Result);
14471
14472 uint8_t fXcpt = pSoftState->exceptionFlags;
14473 /* If DAZ is set \#DE is never set. */
14474 if ( fMxcsr & X86_MXCSR_DAZ
14475 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14476 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14477 fXcpt &= ~X86_MXCSR_DE;
14478
14479 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14480}
14481
14482
14483/**
14484 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
14485 * in MXCSR into account.
14486 *
14487 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14488 * @param pr32Val Where to store the result.
14489 * @param fMxcsr The input MXCSR value.
14490 * @param pr32Src The value to use.
14491 */
14492DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
14493{
14494 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
14495 {
14496 if (fMxcsr & X86_MXCSR_DAZ)
14497 {
14498 /* De-normals are changed to 0. */
14499 pr32Val->s.fSign = pr32Src->s.fSign;
14500 pr32Val->s.uFraction = 0;
14501 pr32Val->s.uExponent = 0;
14502 return 0;
14503 }
14504
14505 *pr32Val = *pr32Src;
14506 return X86_MXCSR_DE;
14507 }
14508
14509 *pr32Val = *pr32Src;
14510 return 0;
14511}
14512
14513
14514/**
14515 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
14516 * in MXCSR into account.
14517 *
14518 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14519 * @param pr64Val Where to store the result.
14520 * @param fMxcsr The input MXCSR value.
14521 * @param pr64Src The value to use.
14522 */
14523DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
14524{
14525 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
14526 {
14527 if (fMxcsr & X86_MXCSR_DAZ)
14528 {
14529 /* De-normals are changed to 0. */
14530 pr64Val->s64.fSign = pr64Src->s.fSign;
14531 pr64Val->s64.uFraction = 0;
14532 pr64Val->s64.uExponent = 0;
14533 return 0;
14534 }
14535
14536 *pr64Val = *pr64Src;
14537 return X86_MXCSR_DE;
14538 }
14539
14540 *pr64Val = *pr64Src;
14541 return 0;
14542}
14543
14544
14545/**
14546 * Validates the given input operands returning whether the operation can continue or whether one
14547 * of the source operands contains a NaN value, setting the output accordingly.
14548 *
14549 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14550 * @param pr32Res Where to store the result in case the operation can't continue.
14551 * @param pr32Val1 The first input operand.
14552 * @param pr32Val2 The second input operand.
14553 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14554 */
14555DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
14556{
14557 uint8_t cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
14558 uint8_t cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
14559 if (cSNan + cQNan == 2)
14560 {
14561 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14562 *pr32Res = *pr32Val1;
14563 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14564 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14565 return true;
14566 }
14567 else if (cSNan)
14568 {
14569 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14570 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14571 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14572 *pfMxcsr |= X86_MXCSR_IE;
14573 return true;
14574 }
14575 else if (cQNan)
14576 {
14577 /* The QNan operand is placed into the result. */
14578 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14579 return true;
14580 }
14581
14582 Assert(!cQNan && !cSNan);
14583 return false;
14584}
14585
14586
14587/**
14588 * Validates the given double precision input operands returning whether the operation can continue or whether one
14589 * of the source operands contains a NaN value, setting the output accordingly.
14590 *
14591 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14592 * @param pr64Res Where to store the result in case the operation can't continue.
14593 * @param pr64Val1 The first input operand.
14594 * @param pr64Val2 The second input operand.
14595 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14596 */
14597DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
14598{
14599 uint8_t cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
14600 uint8_t cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
14601 if (cSNan + cQNan == 2)
14602 {
14603 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14604 *pr64Res = *pr64Val1;
14605 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14606 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14607 return true;
14608 }
14609 else if (cSNan)
14610 {
14611 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14612 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14613 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14614 *pfMxcsr |= X86_MXCSR_IE;
14615 return true;
14616 }
14617 else if (cQNan)
14618 {
14619 /* The QNan operand is placed into the result. */
14620 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14621 return true;
14622 }
14623
14624 Assert(!cQNan && !cSNan);
14625 return false;
14626}
14627
14628
14629/**
14630 * Validates the given single input operand returning whether the operation can continue or whether
14631 * contains a NaN value, setting the output accordingly.
14632 *
14633 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14634 * @param pr32Res Where to store the result in case the operation can't continue.
14635 * @param pr32Val The input operand.
14636 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14637 */
14638DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
14639{
14640 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
14641 {
14642 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14643 *pr32Res = *pr32Val;
14644 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14645 *pfMxcsr |= X86_MXCSR_IE;
14646 return true;
14647 }
14648 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
14649 {
14650 /* The QNan operand is placed into the result. */
14651 *pr32Res = *pr32Val;
14652 return true;
14653 }
14654
14655 return false;
14656}
14657
14658
14659/**
14660 * Validates the given double input operand returning whether the operation can continue or whether
14661 * contains a NaN value, setting the output accordingly.
14662 *
14663 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14664 * @param pr64Res Where to store the result in case the operation can't continue.
14665 * @param pr64Val The input operand.
14666 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14667 */
14668DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
14669{
14670 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
14671 {
14672 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14673 *pr64Res = *pr64Val;
14674 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14675 *pfMxcsr |= X86_MXCSR_IE;
14676 return true;
14677 }
14678 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
14679 {
14680 /* The QNan operand is placed into the result. */
14681 *pr64Res = *pr64Val;
14682 return true;
14683 }
14684
14685 return false;
14686}
14687#endif
14688
14689
14690/**
14691 * ADDPS
14692 */
14693#ifdef IEM_WITHOUT_ASSEMBLY
14694static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14695{
14696 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14697 return fMxcsr;
14698
14699 RTFLOAT32U r32Src1, r32Src2;
14700 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14701 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14702 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14703 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14704 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14705}
14706
14707
14708IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14709{
14710 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14711 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14712 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14713 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14714}
14715#endif
14716
14717
14718/**
14719 * ADDSS
14720 */
14721#ifdef IEM_WITHOUT_ASSEMBLY
14722IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14723{
14724 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14725 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14726 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14727 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14728}
14729#endif
14730
14731
14732/**
14733 * ADDPD
14734 */
14735#ifdef IEM_WITHOUT_ASSEMBLY
14736static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14737{
14738 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14739 return fMxcsr;
14740
14741 RTFLOAT64U r64Src1, r64Src2;
14742 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14743 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14744 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14745 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14746 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14747}
14748
14749
14750IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14751{
14752 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14753 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14754}
14755#endif
14756
14757
14758/**
14759 * ADDSD
14760 */
14761#ifdef IEM_WITHOUT_ASSEMBLY
14762IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14763{
14764 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14765 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14766}
14767#endif
14768
14769
14770/**
14771 * MULPS
14772 */
14773#ifdef IEM_WITHOUT_ASSEMBLY
14774static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14775{
14776 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14777 return fMxcsr;
14778
14779 RTFLOAT32U r32Src1, r32Src2;
14780 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14781 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14782 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14783 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14784 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14785}
14786
14787
14788IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14789{
14790 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14791 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14792 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14793 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14794}
14795#endif
14796
14797
14798/**
14799 * MULSS
14800 */
14801#ifdef IEM_WITHOUT_ASSEMBLY
14802IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14803{
14804 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14805 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14806 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14807 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14808}
14809#endif
14810
14811
14812/**
14813 * MULPD
14814 */
14815#ifdef IEM_WITHOUT_ASSEMBLY
14816static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14817{
14818 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14819 return fMxcsr;
14820
14821 RTFLOAT64U r64Src1, r64Src2;
14822 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14823 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14824 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14825 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14826 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14827}
14828
14829
14830IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14831{
14832 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14833 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14834}
14835#endif
14836
14837
14838/**
14839 * MULSD
14840 */
14841#ifdef IEM_WITHOUT_ASSEMBLY
14842IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14843{
14844 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14845 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14846}
14847#endif
14848
14849
14850/**
14851 * SUBPS
14852 */
14853#ifdef IEM_WITHOUT_ASSEMBLY
14854static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14855{
14856 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14857 return fMxcsr;
14858
14859 RTFLOAT32U r32Src1, r32Src2;
14860 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14861 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14862 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14863 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14864 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14865}
14866
14867
14868IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14869{
14870 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14871 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14872 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14873 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14874}
14875#endif
14876
14877
14878/**
14879 * SUBSS
14880 */
14881#ifdef IEM_WITHOUT_ASSEMBLY
14882IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14883{
14884 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14885 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14886 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14887 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14888}
14889#endif
14890
14891
14892/**
14893 * SUBPD
14894 */
14895#ifdef IEM_WITHOUT_ASSEMBLY
14896static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14897{
14898 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14899 return fMxcsr;
14900
14901 RTFLOAT64U r64Src1, r64Src2;
14902 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14903 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14904 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14905 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14906 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14907}
14908
14909
14910IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14911{
14912 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14913 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14914}
14915#endif
14916
14917
14918/**
14919 * SUBSD
14920 */
14921#ifdef IEM_WITHOUT_ASSEMBLY
14922IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14923{
14924 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14925 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14926}
14927#endif
14928
14929
14930/**
14931 * MINPS
14932 */
14933#ifdef IEM_WITHOUT_ASSEMBLY
14934static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14935{
14936 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
14937 {
14938 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
14939 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
14940 return fMxcsr | X86_MXCSR_IE;
14941 }
14942
14943 RTFLOAT32U r32Src1, r32Src2;
14944 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14945 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14946 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
14947 {
14948 *pr32Res = r32Src2;
14949 return fMxcsr;
14950 }
14951
14952 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14953 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14954 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
14955 fLe
14956 ? iemFpSoftF32FromIprt(&r32Src1)
14957 : iemFpSoftF32FromIprt(&r32Src2),
14958 pr32Res, fMxcsr);
14959}
14960
14961
14962IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14963{
14964 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14965 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14966 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14967 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14968}
14969#endif
14970
14971
14972/**
14973 * MINSS
14974 */
14975#ifdef IEM_WITHOUT_ASSEMBLY
14976IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14977{
14978 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14979 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14980 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14981 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14982}
14983#endif
14984
14985
14986/**
14987 * MINPD
14988 */
14989#ifdef IEM_WITHOUT_ASSEMBLY
14990static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14991{
14992 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
14993 {
14994 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
14995 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
14996 return fMxcsr | X86_MXCSR_IE;
14997 }
14998
14999 RTFLOAT64U r64Src1, r64Src2;
15000 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15001 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15002 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15003 {
15004 *pr64Res = r64Src2;
15005 return fMxcsr;
15006 }
15007
15008 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15009 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15010 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15011 fLe
15012 ? iemFpSoftF64FromIprt(&r64Src1)
15013 : iemFpSoftF64FromIprt(&r64Src2),
15014 pr64Res, fMxcsr);
15015}
15016
15017
15018IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15019{
15020 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15021 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15022}
15023#endif
15024
15025
15026/**
15027 * MINSD
15028 */
15029#ifdef IEM_WITHOUT_ASSEMBLY
15030IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15031{
15032 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15033 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15034}
15035#endif
15036
15037
15038/**
15039 * DIVPS
15040 */
15041#ifdef IEM_WITHOUT_ASSEMBLY
15042static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15043{
15044 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15045 return fMxcsr;
15046
15047 RTFLOAT32U r32Src1, r32Src2;
15048 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15049 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15050 if (RTFLOAT32U_IS_ZERO(&r32Src2))
15051 {
15052 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
15053 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
15054 {
15055 *pr32Res = g_ar32QNaN[1];
15056 return fMxcsr | X86_MXCSR_IE;
15057 }
15058 else if (RTFLOAT32U_IS_INF(&r32Src1))
15059 {
15060 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15061 return fMxcsr;
15062 }
15063 else
15064 {
15065 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15066 return fMxcsr | X86_MXCSR_ZE;
15067 }
15068 }
15069
15070 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15071 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15072 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15073}
15074
15075
15076IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15077{
15078 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15079 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15080 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15081 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15082}
15083#endif
15084
15085
15086/**
15087 * DIVSS
15088 */
15089#ifdef IEM_WITHOUT_ASSEMBLY
15090IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15091{
15092 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15093 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15094 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15095 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15096}
15097#endif
15098
15099
15100/**
15101 * DIVPD
15102 */
15103#ifdef IEM_WITHOUT_ASSEMBLY
15104static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15105{
15106 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15107 return fMxcsr;
15108
15109 RTFLOAT64U r64Src1, r64Src2;
15110 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15111 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15112 if (RTFLOAT64U_IS_ZERO(&r64Src2))
15113 {
15114 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
15115 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
15116 {
15117 *pr64Res = g_ar64QNaN[1];
15118 return fMxcsr | X86_MXCSR_IE;
15119 }
15120 else if (RTFLOAT64U_IS_INF(&r64Src1))
15121 {
15122 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15123 return fMxcsr;
15124 }
15125 else
15126 {
15127 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15128 return fMxcsr | X86_MXCSR_ZE;
15129 }
15130 }
15131
15132 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15133 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15134 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15135}
15136
15137
15138IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15139{
15140 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15141 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15142}
15143#endif
15144
15145
15146/**
15147 * DIVSD
15148 */
15149#ifdef IEM_WITHOUT_ASSEMBLY
15150IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15151{
15152 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15153 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15154}
15155#endif
15156
15157
15158/**
15159 * MAXPS
15160 */
15161#ifdef IEM_WITHOUT_ASSEMBLY
15162static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15163{
15164 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15165 {
15166 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15167 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15168 return fMxcsr | X86_MXCSR_IE;
15169 }
15170
15171 RTFLOAT32U r32Src1, r32Src2;
15172 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15173 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15174 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15175 {
15176 *pr32Res = r32Src2;
15177 return fMxcsr;
15178 }
15179
15180 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15181 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15182 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15183 fLe
15184 ? iemFpSoftF32FromIprt(&r32Src2)
15185 : iemFpSoftF32FromIprt(&r32Src1),
15186 pr32Res, fMxcsr);
15187}
15188
15189
15190IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15191{
15192 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15193 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15194 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15195 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15196}
15197#endif
15198
15199
15200/**
15201 * MAXSS
15202 */
15203#ifdef IEM_WITHOUT_ASSEMBLY
15204IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15205{
15206 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15207 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15208 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15209 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15210}
15211#endif
15212
15213
15214/**
15215 * MAXPD
15216 */
15217#ifdef IEM_WITHOUT_ASSEMBLY
15218static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15219{
15220 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15221 {
15222 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15223 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15224 return fMxcsr | X86_MXCSR_IE;
15225 }
15226
15227 RTFLOAT64U r64Src1, r64Src2;
15228 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15229 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15230 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15231 {
15232 *pr64Res = r64Src2;
15233 return fMxcsr;
15234 }
15235
15236 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15237 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15238 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15239 fLe
15240 ? iemFpSoftF64FromIprt(&r64Src2)
15241 : iemFpSoftF64FromIprt(&r64Src1),
15242 pr64Res, fMxcsr);
15243}
15244
15245
15246IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15247{
15248 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15249 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15250}
15251#endif
15252
15253
15254/**
15255 * MAXSD
15256 */
15257#ifdef IEM_WITHOUT_ASSEMBLY
15258IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15259{
15260 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15261 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15262}
15263#endif
15264
15265
15266/**
15267 * CVTSS2SD
15268 */
15269#ifdef IEM_WITHOUT_ASSEMBLY
15270static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15271{
15272 RTFLOAT32U r32Src1;
15273 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15274
15275 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15276 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15277 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15278}
15279
15280
15281IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15282{
15283 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
15284 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15285}
15286#endif
15287
15288
15289/**
15290 * CVTSD2SS
15291 */
15292#ifdef IEM_WITHOUT_ASSEMBLY
15293static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15294{
15295 RTFLOAT64U r64Src1;
15296 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15297
15298 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15299 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15300 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15301}
15302
15303
15304IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15305{
15306 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
15307 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15308 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15309 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15310}
15311#endif
15312
15313
15314/**
15315 * HADDPS
15316 */
15317#ifdef IEM_WITHOUT_ASSEMBLY
15318IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15319{
15320 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15321 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15322 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15323 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15324}
15325#endif
15326
15327
15328/**
15329 * HADDPD
15330 */
15331#ifdef IEM_WITHOUT_ASSEMBLY
15332IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15333{
15334 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15335 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15336}
15337#endif
15338
15339
15340/**
15341 * HSUBPS
15342 */
15343#ifdef IEM_WITHOUT_ASSEMBLY
15344IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15345{
15346 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15347 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15348 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15349 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15350}
15351#endif
15352
15353
15354/**
15355 * HSUBPD
15356 */
15357#ifdef IEM_WITHOUT_ASSEMBLY
15358IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15359{
15360 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15361 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15362}
15363#endif
15364
15365
15366/**
15367 * SQRTPS
15368 */
15369#ifdef IEM_WITHOUT_ASSEMBLY
15370static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15371{
15372 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15373 return fMxcsr;
15374
15375 RTFLOAT32U r32Src;
15376 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
15377 if (RTFLOAT32U_IS_ZERO(&r32Src))
15378 {
15379 *pr32Res = r32Src;
15380 return fMxcsr;
15381 }
15382 else if (r32Src.s.fSign)
15383 {
15384 *pr32Res = g_ar32QNaN[1];
15385 return fMxcsr | X86_MXCSR_IE;
15386 }
15387
15388 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15389 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15390 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15391}
15392
15393
15394IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15395{
15396 RT_NOREF(puSrc1);
15397
15398 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15399 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15400 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15401 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15402}
15403#endif
15404
15405
15406/**
15407 * SQRTSS
15408 */
15409#ifdef IEM_WITHOUT_ASSEMBLY
15410IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15411{
15412 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15413 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15414 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15415 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15416}
15417#endif
15418
15419
15420/**
15421 * SQRTPD
15422 */
15423#ifdef IEM_WITHOUT_ASSEMBLY
15424static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
15425{
15426 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
15427 return fMxcsr;
15428
15429 RTFLOAT64U r64Src;
15430 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
15431 if (RTFLOAT64U_IS_ZERO(&r64Src))
15432 {
15433 *pr64Res = r64Src;
15434 return fMxcsr;
15435 }
15436 else if (r64Src.s.fSign)
15437 {
15438 *pr64Res = g_ar64QNaN[1];
15439 return fMxcsr | X86_MXCSR_IE;
15440 }
15441
15442 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15443 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
15444 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15445}
15446
15447
15448IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15449{
15450 RT_NOREF(puSrc1);
15451
15452 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15453 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15454}
15455#endif
15456
15457
15458/**
15459 * SQRTSD
15460 */
15461#ifdef IEM_WITHOUT_ASSEMBLY
15462IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15463{
15464 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
15465 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15466}
15467#endif
15468
15469
15470/**
15471 * ADDSUBPS
15472 */
15473#ifdef IEM_WITHOUT_ASSEMBLY
15474IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15475{
15476 RT_NOREF(puSrc1);
15477
15478 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15479 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15480 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15481 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15482}
15483#endif
15484
15485
15486/**
15487 * ADDSUBPD
15488 */
15489#ifdef IEM_WITHOUT_ASSEMBLY
15490IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15491{
15492 RT_NOREF(puSrc1);
15493
15494 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15495 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15496}
15497#endif
15498
15499
15500/**
15501 * CVTPD2PS
15502 */
15503#ifdef IEM_WITHOUT_ASSEMBLY
15504static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15505{
15506 RTFLOAT64U r64Src1;
15507 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15508
15509 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15510 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15511 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15512}
15513
15514
15515IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15516{
15517 RT_NOREF(puSrc1);
15518
15519 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15520 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15521 pResult->uResult.au32[2] = 0;
15522 pResult->uResult.au32[3] = 0;
15523}
15524#endif
15525
15526
15527/**
15528 * CVTPS2PD
15529 */
15530#ifdef IEM_WITHOUT_ASSEMBLY
15531static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15532{
15533 RTFLOAT32U r32Src1;
15534 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15535
15536 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15537 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15538 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15539}
15540
15541
15542IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15543{
15544 RT_NOREF(puSrc1);
15545
15546 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15547 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15548}
15549#endif
15550
15551
15552/**
15553 * CVTDQ2PS
15554 */
15555#ifdef IEM_WITHOUT_ASSEMBLY
15556static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
15557{
15558 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15559 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
15560 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15561}
15562
15563
15564IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15565{
15566 RT_NOREF(puSrc1);
15567
15568 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
15569 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
15570 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
15571 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
15572}
15573#endif
15574
15575
15576/**
15577 * CVTPS2DQ
15578 */
15579#ifdef IEM_WITHOUT_ASSEMBLY
15580static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15581{
15582 RTFLOAT32U r32Src;
15583 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
15584
15585 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15586 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15587 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15588}
15589
15590
15591IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15592{
15593 RT_NOREF(puSrc1);
15594
15595 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15596 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15597 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15598 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15599}
15600#endif
15601
15602
15603/**
15604 * CVTTPS2DQ
15605 */
15606#ifdef IEM_WITHOUT_ASSEMBLY
15607static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15608{
15609 RTFLOAT32U r32Src;
15610 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
15611
15612 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15613 SoftState.roundingMode = softfloat_round_minMag;
15614 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
15615 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15616}
15617
15618
15619IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15620{
15621 RT_NOREF(puSrc1);
15622
15623 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15624 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15625 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15626 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15627}
15628#endif
15629
15630
15631/**
15632 * CVTTPD2DQ
15633 */
15634#ifdef IEM_WITHOUT_ASSEMBLY
15635static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15636{
15637 RTFLOAT64U r64Src;
15638 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
15639
15640 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15641 SoftState.roundingMode = softfloat_round_minMag;
15642 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15643 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15644}
15645
15646
15647IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15648{
15649 RT_NOREF(puSrc1);
15650
15651 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15652 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15653 pResult->uResult.au64[1] = 0;
15654}
15655#endif
15656
15657
15658/**
15659 * CVTDQ2PD
15660 */
15661#ifdef IEM_WITHOUT_ASSEMBLY
15662static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
15663{
15664 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15665 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
15666 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15667}
15668
15669
15670IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15671{
15672 RT_NOREF(puSrc1);
15673
15674 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
15675 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
15676}
15677#endif
15678
15679
15680/**
15681 * CVTPD2DQ
15682 */
15683#ifdef IEM_WITHOUT_ASSEMBLY
15684static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15685{
15686 RTFLOAT64U r64Src;
15687 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
15688
15689 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15690 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15691 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15692}
15693
15694
15695IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15696{
15697 RT_NOREF(puSrc1);
15698
15699 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15700 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15701 pResult->uResult.au64[1] = 0;
15702}
15703#endif
15704
15705
15706/**
15707 * [V]SHUFPS
15708 */
15709#ifdef IEM_WITHOUT_ASSEMBLY
15710IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15711{
15712 RTUINT128U const uSrc1 = *puDst;
15713 RTUINT128U const uSrc2 = *puSrc;
15714 ASMCompilerBarrier();
15715 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15716 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15717 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15718 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15719}
15720#endif
15721
15722
15723IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15724{
15725 RTUINT128U const uSrc1 = *puSrc1;
15726 RTUINT128U const uSrc2 = *puSrc2;
15727 ASMCompilerBarrier();
15728 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15729 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15730 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15731 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15732}
15733
15734
15735IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15736{
15737 RTUINT256U const uSrc1 = *puSrc1;
15738 RTUINT256U const uSrc2 = *puSrc2;
15739 ASMCompilerBarrier();
15740 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15741 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15742 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15743 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15744
15745 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
15746 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
15747 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
15748 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
15749}
15750
15751
15752/**
15753 * [V]SHUFPD
15754 */
15755#ifdef IEM_WITHOUT_ASSEMBLY
15756IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15757{
15758 RTUINT128U const uSrc1 = *puDst;
15759 RTUINT128U const uSrc2 = *puSrc;
15760 ASMCompilerBarrier();
15761 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15762 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15763}
15764#endif
15765
15766
15767IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15768{
15769 RTUINT128U const uSrc1 = *puSrc1;
15770 RTUINT128U const uSrc2 = *puSrc2;
15771 ASMCompilerBarrier();
15772 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15773 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15774}
15775
15776
15777IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15778{
15779 RTUINT256U const uSrc1 = *puSrc1;
15780 RTUINT256U const uSrc2 = *puSrc2;
15781 ASMCompilerBarrier();
15782 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15783 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15784 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
15785 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
15786}
15787
15788
15789/*
15790 * PHMINPOSUW / VPHMINPOSUW
15791 */
15792IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15793{
15794 uint16_t u16Min = puSrc->au16[0];
15795 uint8_t idxMin = 0;
15796
15797 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
15798 if (puSrc->au16[i] < u16Min)
15799 {
15800 u16Min = puSrc->au16[i];
15801 idxMin = i;
15802 }
15803
15804 puDst->au64[0] = 0;
15805 puDst->au64[1] = 0;
15806 puDst->au16[0] = u16Min;
15807 puDst->au16[1] = idxMin;
15808}
15809
15810
15811IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15812{
15813 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
15814}
15815
15816
15817/*
15818 * [V]PBLENDVB
15819 */
15820IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15821{
15822 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15823 if (puMask->au8[i] & RT_BIT(7))
15824 puDst->au8[i] = puSrc->au8[i];
15825}
15826
15827
15828IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15829{
15830 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15831 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
15832}
15833
15834
15835IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15836{
15837 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15838 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
15839}
15840
15841
15842/*
15843 * [V]BLENDVPS
15844 */
15845IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15846{
15847 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15848 if (puMask->au32[i] & RT_BIT_32(31))
15849 puDst->au32[i] = puSrc->au32[i];
15850}
15851
15852
15853IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15854{
15855 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15856 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
15857}
15858
15859
15860IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15861{
15862 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15863 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
15864}
15865
15866
15867/*
15868 * [V]BLENDVPD
15869 */
15870IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15871{
15872 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
15873 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
15874}
15875
15876
15877IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15878{
15879 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15880 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
15881}
15882
15883
15884IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15885{
15886 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15887 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
15888}
15889
15890
15891/**
15892 * [V]PALIGNR
15893 */
15894IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
15895{
15896 uint64_t const u64Src1 = *pu64Dst;
15897 ASMCompilerBarrier();
15898
15899 if (bEvil >= 16)
15900 *pu64Dst = 0;
15901 else if (bEvil >= 8)
15902 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
15903 else
15904 {
15905 uint8_t cShift = bEvil * 8;
15906 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
15907 | (u64Src2 >> cShift);
15908 }
15909}
15910
15911
15912IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15913{
15914 RTUINT128U const uSrc1 = *puDst;
15915 RTUINT128U const uSrc2 = *puSrc;
15916 ASMCompilerBarrier();
15917
15918 puDst->au64[0] = 0;
15919 puDst->au64[1] = 0;
15920 if (bEvil >= 32)
15921 { /* Everything stays 0. */ }
15922 else if (bEvil >= 16)
15923 {
15924 bEvil -= 16;
15925 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
15926 puDst->au8[i - bEvil] = uSrc1.au8[i];
15927 }
15928 else
15929 {
15930 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
15931 puDst->au8[i] = uSrc2.au8[i + bEvil];
15932 for (uint8_t i = 0; i < bEvil; i++)
15933 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
15934 }
15935}
15936
15937
15938IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15939{
15940 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
15941 RTUINT128U const uSrc2 = *puSrc2;
15942 ASMCompilerBarrier();
15943
15944 puDst->au64[0] = 0;
15945 puDst->au64[1] = 0;
15946 if (bEvil >= 32)
15947 { /* Everything stays 0. */ }
15948 else if (bEvil >= 16)
15949 {
15950 bEvil -= 16;
15951 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
15952 puDst->au8[i - bEvil] = uSrc1.au8[i];
15953 }
15954 else
15955 {
15956 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
15957 puDst->au8[i] = uSrc2.au8[i + bEvil];
15958 for (uint8_t i = 0; i < bEvil; i++)
15959 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
15960 }
15961}
15962
15963
15964IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15965{
15966 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
15967 RTUINT256U const uSrc2 = *puSrc2;
15968 ASMCompilerBarrier();
15969
15970 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
15971 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
15972}
15973
15974
15975/**
15976 * [V]PBLENDW
15977 */
15978IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15979{
15980 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
15981 if (bEvil & RT_BIT(i))
15982 puDst->au16[i] = puSrc->au16[i];
15983}
15984
15985
15986IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15987{
15988 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
15989 if (bEvil & RT_BIT(i))
15990 puDst->au16[i] = puSrc2->au16[i];
15991 else
15992 puDst->au16[i] = puSrc1->au16[i];
15993}
15994
15995
15996IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15997{
15998 for (uint8_t i = 0; i < 8; i++)
15999 if (bEvil & RT_BIT(i))
16000 {
16001 puDst->au16[ i] = puSrc2->au16[ i];
16002 puDst->au16[8 + i] = puSrc2->au16[8 + i];
16003 }
16004 else
16005 {
16006 puDst->au16[ i] = puSrc1->au16[ i];
16007 puDst->au16[8 + i] = puSrc1->au16[8 + i];
16008 }
16009}
16010
16011
16012/**
16013 * [V]BLENDPS
16014 */
16015IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16016{
16017 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16018 if (bEvil & RT_BIT(i))
16019 puDst->au32[i] = puSrc->au32[i];
16020}
16021
16022
16023IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16024{
16025 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16026 if (bEvil & RT_BIT(i))
16027 puDst->au32[i] = puSrc2->au32[i];
16028 else
16029 puDst->au32[i] = puSrc1->au32[i];
16030}
16031
16032
16033IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16034{
16035 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16036 if (bEvil & RT_BIT(i))
16037 puDst->au32[i] = puSrc2->au32[i];
16038 else
16039 puDst->au32[i] = puSrc1->au32[i];
16040}
16041
16042
16043/**
16044 * [V]BLENDPD
16045 */
16046IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16047{
16048 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16049 if (bEvil & RT_BIT(i))
16050 puDst->au64[i] = puSrc->au64[i];
16051}
16052
16053
16054IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16055{
16056 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16057 if (bEvil & RT_BIT(i))
16058 puDst->au64[i] = puSrc2->au64[i];
16059 else
16060 puDst->au64[i] = puSrc1->au64[i];
16061}
16062
16063
16064IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16065{
16066 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16067 if (bEvil & RT_BIT(i))
16068 puDst->au64[i] = puSrc2->au64[i];
16069 else
16070 puDst->au64[i] = puSrc1->au64[i];
16071}
16072
16073
16074/**
16075 * [V]PCMPISTRI
16076 */
16077IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRISRC pSrc, uint8_t bEvil))
16078{
16079 RT_NOREF(pu32Ecx, pEFlags, pSrc, bEvil);
16080 AssertReleaseFailed();
16081}
16082
16083
16084/*
16085 * [V]PCLMULQDQ
16086 */
16087IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16088{
16089 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
16090}
16091
16092
16093IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16094{
16095 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
16096 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
16097
16098 puDst->au64[0] = 0;
16099 puDst->au64[1] = 0;
16100
16101 /*
16102 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
16103 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
16104 * and squeeze out some optimizations.
16105 */
16106 if (uSrc1 & 0x1)
16107 puDst->au64[0] = uSrc2;
16108
16109 uSrc1 >>= 1;
16110
16111 uint8_t iDigit = 1;
16112 while (uSrc1)
16113 {
16114 if (uSrc1 & 0x1)
16115 {
16116 puDst->au64[0] ^= (uSrc2 << iDigit);
16117 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
16118 }
16119
16120 uSrc1 >>= 1;
16121 iDigit++;
16122 }
16123}
16124
16125
16126/**
16127 * [V]PINSRW
16128 */
16129#ifdef IEM_WITHOUT_ASSEMBLY
16130IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u64,(uint64_t *pu64Dst, uint16_t u16Src, uint8_t bEvil))
16131{
16132 uint8_t cShift = (bEvil & 0x3) * 16;
16133 *pu64Dst = (*pu64Dst & ~(UINT64_C(0xffff) << cShift)) | ((uint64_t)u16Src << cShift);
16134}
16135
16136
16137IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u128,(PRTUINT128U puDst, uint16_t u16Src, uint8_t bEvil))
16138{
16139 puDst->au16[bEvil & 0x7] = u16Src;
16140}
16141#endif
16142
16143
16144IEM_DECL_IMPL_DEF(void, iemAImpl_vpinsrw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint16_t u16Src, uint8_t bEvil))
16145{
16146 *puDst = *puSrc;
16147 puDst->au16[bEvil & 0x7] = u16Src;
16148}
16149
16150
16151/**
16152 * [V]PEXTRW
16153 */
16154#ifdef IEM_WITHOUT_ASSEMBLY
16155IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u64,(uint16_t *pu16Dst, uint64_t u64Src, uint8_t bEvil))
16156{
16157 *pu16Dst = (uint16_t)(u64Src >> ((bEvil & 0x3) * 16));
16158}
16159
16160
16161IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u128,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
16162{
16163 *pu16Dst = puSrc->au16[bEvil & 0x7];
16164}
16165
16166#endif
16167
16168IEM_DECL_IMPL_DEF(void, iemAImpl_vpextrw_u128_fallback,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
16169{
16170 *pu16Dst = puSrc->au16[bEvil & 0x7];
16171}
16172
16173
16174/**
16175 * [V]MOVMSKPS
16176 */
16177#ifdef IEM_WITHOUT_ASSEMBLY
16178IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16179{
16180 *pu8Dst = puSrc->au32[0] >> 31;
16181 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16182 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16183 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16184}
16185
16186#endif
16187
16188IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16189{
16190 *pu8Dst = puSrc->au32[0] >> 31;
16191 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16192 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16193 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16194}
16195
16196
16197IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
16198{
16199 *pu8Dst = puSrc->au32[0] >> 31;
16200 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16201 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16202 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16203 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
16204 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
16205 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
16206 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
16207}
16208
16209
16210/**
16211 * [V]MOVMSKPD
16212 */
16213#ifdef IEM_WITHOUT_ASSEMBLY
16214IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16215{
16216 *pu8Dst = puSrc->au64[0] >> 63;
16217 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16218}
16219
16220#endif
16221
16222IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16223{
16224 *pu8Dst = puSrc->au64[0] >> 63;
16225 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16226}
16227
16228
16229IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
16230{
16231 *pu8Dst = puSrc->au64[0] >> 63;
16232 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16233 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
16234 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
16235}
16236
16237
16238/**
16239 * CVTTSD2SI
16240 */
16241#ifdef IEM_WITHOUT_ASSEMBLY
16242IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
16243{
16244 RTFLOAT64U r64Src;
16245
16246 r64Src.u = *pu64Src;
16247 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16248
16249 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16250 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
16251 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16252}
16253
16254
16255IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
16256{
16257 RTFLOAT64U r64Src;
16258
16259 r64Src.u = *pu64Src;
16260 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16261
16262 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16263 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
16264 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16265}
16266#endif
16267
16268
16269/**
16270 * CVTSD2SI
16271 */
16272#ifdef IEM_WITHOUT_ASSEMBLY
16273IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
16274{
16275 RTFLOAT64U r64Src;
16276
16277 r64Src.u = *pu64Src;
16278 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16279
16280 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16281 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16282 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16283}
16284
16285
16286IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
16287{
16288 RTFLOAT64U r64Src;
16289
16290 r64Src.u = *pu64Src;
16291 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16292
16293 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16294 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16295 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16296}
16297#endif
16298
16299
16300/**
16301 * CVTTSS2SI
16302 */
16303#ifdef IEM_WITHOUT_ASSEMBLY
16304IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
16305{
16306 RTFLOAT32U r32Src;
16307
16308 r32Src.u = *pu32Src;
16309 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16310
16311 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16312 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16313 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16314}
16315
16316
16317IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
16318{
16319 RTFLOAT32U r32Src;
16320
16321 r32Src.u = *pu32Src;
16322 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16323
16324 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16325 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16326 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16327}
16328#endif
16329
16330
16331/**
16332 * CVTSS2SI
16333 */
16334#ifdef IEM_WITHOUT_ASSEMBLY
16335IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
16336{
16337 RTFLOAT32U r32Src;
16338
16339 r32Src.u = *pu32Src;
16340 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16341
16342 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16343 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16344 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16345}
16346
16347
16348IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
16349{
16350 RTFLOAT32U r32Src;
16351
16352 r32Src.u = *pu32Src;
16353 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16354
16355 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16356 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16357 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16358}
16359#endif
16360
16361
16362/**
16363 * CVTSI2SD
16364 */
16365#ifdef IEM_WITHOUT_ASSEMBLY
16366IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
16367{
16368 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16369 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
16370 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
16371}
16372
16373
16374IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
16375{
16376 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16377 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
16378 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
16379}
16380#endif
16381
16382
16383/**
16384 * CVTSI2SS
16385 */
16386#ifdef IEM_WITHOUT_ASSEMBLY
16387IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
16388{
16389 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16390 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
16391 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
16392}
16393
16394
16395IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
16396{
16397 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16398 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
16399 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
16400}
16401#endif
16402
16403
16404/**
16405 * [V]UCOMISS
16406 */
16407#ifdef IEM_WITHOUT_ASSEMBLY
16408IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16409{
16410 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16411
16412 if (RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]))
16413 {
16414 *pfMxcsr |= X86_MXCSR_IE;
16415 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16416 }
16417 else if (RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
16418 {
16419 /* ucomiss doesn't raise \#IE for quiet NaNs. */
16420 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16421 }
16422 else
16423 {
16424 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16425
16426 RTFLOAT32U r32Src1, r32Src2;
16427 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
16428 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
16429
16430 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
16431 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
16432 if (f32_eq(f32Src1, f32Src2, &SoftState))
16433 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16434 else if (f32_lt(f32Src1, f32Src2, &SoftState))
16435 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16436 /* else: GREATER_THAN 000 */
16437
16438 *pfMxcsr |= fDe;
16439 }
16440
16441 *pfEFlags = fEFlagsNew;
16442}
16443#endif
16444
16445IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16446{
16447 iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
16448}
16449
16450
16451/**
16452 * [V]UCOMISD
16453 */
16454#ifdef IEM_WITHOUT_ASSEMBLY
16455IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16456{
16457 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16458
16459 if (RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]))
16460 {
16461 *pfMxcsr |= X86_MXCSR_IE;
16462 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16463 }
16464 else if (RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
16465 {
16466 /* ucomiss doesn't raise \#IE for quiet NaNs. */
16467 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16468 }
16469 else
16470 {
16471 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16472
16473 RTFLOAT64U r64Src1, r64Src2;
16474 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
16475 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
16476
16477 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
16478 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
16479 if (f64_eq(f64Src1, f64Src2, &SoftState))
16480 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16481 else if (f64_lt(f64Src1, f64Src2, &SoftState))
16482 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16483 /* else: GREATER_THAN 000 */
16484
16485 *pfMxcsr |= fDe;
16486 }
16487
16488 *pfEFlags = fEFlagsNew;
16489}
16490#endif
16491
16492IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16493{
16494 iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
16495}
16496
16497
16498/**
16499 * [V]COMISS
16500 */
16501#ifdef IEM_WITHOUT_ASSEMBLY
16502IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16503{
16504 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16505
16506 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
16507 || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
16508 {
16509 *pfMxcsr |= X86_MXCSR_IE;
16510 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16511 }
16512 else
16513 {
16514 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16515
16516 RTFLOAT32U r32Src1, r32Src2;
16517 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
16518 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
16519
16520 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
16521 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
16522 if (f32_eq(f32Src1, f32Src2, &SoftState))
16523 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16524 else if (f32_lt(f32Src1, f32Src2, &SoftState))
16525 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16526 /* else: GREATER_THAN 000 */
16527
16528 *pfMxcsr |= fDe;
16529 }
16530
16531 *pfEFlags = fEFlagsNew;
16532}
16533#endif
16534
16535
16536IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16537{
16538 iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
16539}
16540
16541
16542/**
16543 * [V]COMISD
16544 */
16545#ifdef IEM_WITHOUT_ASSEMBLY
16546IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16547{
16548 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16549
16550 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
16551 || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
16552 {
16553 *pfMxcsr |= X86_MXCSR_IE;
16554 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16555 }
16556 else
16557 {
16558 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16559
16560 RTFLOAT64U r64Src1, r64Src2;
16561 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
16562 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
16563
16564 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
16565 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
16566 if (f64_eq(f64Src1, f64Src2, &SoftState))
16567 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16568 else if (f64_lt(f64Src1, f64Src2, &SoftState))
16569 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16570 /* else: GREATER_THAN 000 */
16571
16572 *pfMxcsr |= fDe;
16573 }
16574
16575 *pfEFlags = fEFlagsNew;
16576}
16577#endif
16578
16579IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16580{
16581 iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
16582}
16583
16584
16585/**
16586 * CMPPS / CMPPD / CMPSS / CMPSD
16587 */
16588#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * One row of the compare truth table.
 *
 * A row describes a single compare predicate: the boolean result it yields
 * for each of the four possible relations between operands A and B, plus
 * whether a quiet NaN operand raises the invalid-operation flag.
 *
 * @note    The member order matters: the table rows are initialised
 *          positionally.
 */
typedef struct CMPTRUTHTBLENTRY
{
    /** Whether \#IA is signalled when one of the source operands is a QNaN. */
    bool                fSignalsOnQNan;
    /** Result when the operands are unordered. */
    bool                fUnordered;
    /** Result when A = B. */
    bool                fEqual;
    /** Result when A < B. */
    bool                fLowerThan;
    /** Result when A > B. */
    bool                fGreaterThan;
} CMPTRUTHTBLENTRY;
/** Pointer to a read-only truth table entry. */
typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
16607
16608
16609/** The compare truth table (indexed by immediate). */
16610static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
16611{
16612 /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
16613 /* 00H (EQ_OQ) */ { false, false, true, false, false },
16614 /* 01H (LT_OS) */ { true, false, false, true, false },
16615 /* 02H (LE_OS) */ { true, false, true, true, false },
16616 /* 03H (UNORD_Q) */ { false, true, false, false, false },
16617 /* 04H (NEQ_UQ) */ { false, true, false, true, true },
16618 /* 05H (NLT_US) */ { true, true, true, false, true },
16619 /* 06H (NLE_US) */ { true, true, false, false, true },
16620 /* 07H (ORQ_Q) */ { false, false, true, true, true },
16621 /** @todo AVX variants. */
16622};
16623
16624
16625static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
16626{
16627 bool fRes;
16628 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
16629
16630 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
16631 {
16632 *pfMxcsr |= X86_MXCSR_IE;
16633 fRes = g_aCmpTbl[bEvil].fUnordered;
16634 }
16635 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
16636 {
16637 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
16638 *pfMxcsr |= X86_MXCSR_IE;
16639 fRes = g_aCmpTbl[bEvil].fUnordered;
16640 }
16641 else
16642 {
16643 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16644
16645 RTFLOAT32U r32Src1, r32Src2;
16646 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
16647 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
16648
16649 *pfMxcsr |= fDe;
16650 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
16651 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
16652 if (f32_eq(f32Src1, f32Src2, &SoftState))
16653 fRes = g_aCmpTbl[bEvil].fEqual;
16654 else if (f32_lt(f32Src1, f32Src2, &SoftState))
16655 fRes = g_aCmpTbl[bEvil].fLowerThan;
16656 else
16657 fRes = g_aCmpTbl[bEvil].fGreaterThan;
16658 }
16659
16660 return fRes;
16661}
16662
16663
16664static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
16665{
16666 bool fRes;
16667 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
16668
16669 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
16670 {
16671 *pfMxcsr |= X86_MXCSR_IE;
16672 fRes = g_aCmpTbl[bEvil].fUnordered;
16673 }
16674 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
16675 {
16676 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
16677 *pfMxcsr |= X86_MXCSR_IE;
16678 fRes = g_aCmpTbl[bEvil].fUnordered;
16679 }
16680 else
16681 {
16682 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16683
16684 RTFLOAT64U r64Src1, r64Src2;
16685 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1);
16686 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
16687
16688 *pfMxcsr |= fDe;
16689 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
16690 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
16691 if (f64_eq(f64Src1, f64Src2, &SoftState))
16692 fRes = g_aCmpTbl[bEvil].fEqual;
16693 else if (f64_lt(f64Src1, f64Src2, &SoftState))
16694 fRes = g_aCmpTbl[bEvil].fLowerThan;
16695 else
16696 fRes = g_aCmpTbl[bEvil].fGreaterThan;
16697 }
16698
16699 return fRes;
16700}
16701
16702
16703IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
16704{
16705 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
16706 {
16707 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
16708 puDst->au32[i] = UINT32_MAX;
16709 else
16710 puDst->au32[i] = 0;
16711 }
16712}
16713
16714
16715IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
16716{
16717 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
16718 {
16719 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
16720 puDst->au64[i] = UINT64_MAX;
16721 else
16722 puDst->au64[i] = 0;
16723 }
16724}
16725
16726
16727IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
16728{
16729 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
16730 puDst->au32[0] = UINT32_MAX;
16731 else
16732 puDst->au32[0] = 0;
16733
16734 puDst->au32[1] = pSrc->uSrc1.au32[1];
16735 puDst->au64[1] = pSrc->uSrc1.au64[1];
16736}
16737
16738
16739IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
16740{
16741 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
16742 puDst->au64[0] = UINT64_MAX;
16743 else
16744 puDst->au64[0] = 0;
16745
16746 puDst->au64[1] = pSrc->uSrc1.au64[1];
16747}
16748#endif
16749
16750
16751/**
16752 * CVTPD2PI
16753 */
16754#ifdef IEM_WITHOUT_ASSEMBLY
16755static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
16756{
16757 RTFLOAT64U r64Src;
16758 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
16759
16760 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16761 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16762 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16763}
16764
16765
16766IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
16767{
16768 RTUINT64U u64Res;
16769 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
16770 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
16771
16772 *pu64Dst = u64Res.u;
16773 *pfMxcsr = fMxcsrOut;
16774}
16775#endif
16776
16777
16778/**
16779 * CVTTPD2PI
16780 */
16781#ifdef IEM_WITHOUT_ASSEMBLY
16782static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
16783{
16784 RTFLOAT64U r64Src;
16785 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
16786
16787 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16788 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
16789 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16790}
16791
16792
16793IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
16794{
16795 RTUINT64U u64Res;
16796 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
16797 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
16798
16799 *pu64Dst = u64Res.u;
16800 *pfMxcsr = fMxcsrOut;
16801}
16802#endif
16803
16804
16805/**
16806 * CVTPI2PS
16807 */
16808#ifdef IEM_WITHOUT_ASSEMBLY
16809static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
16810{
16811 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16812 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
16813 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
16814}
16815
16816
16817IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
16818{
16819 RTUINT64U uSrc = { u64Src };
16820 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
16821 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
16822 *pfMxcsr = fMxcsrOut;
16823}
16824#endif
16825
16826
16827/**
16828 * CVTPI2PD
16829 */
16830#ifdef IEM_WITHOUT_ASSEMBLY
16831static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
16832{
16833 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16834 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
16835 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
16836}
16837
16838
16839IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
16840{
16841 RTUINT64U uSrc = { u64Src };
16842 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
16843 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
16844 *pfMxcsr = fMxcsrOut;
16845}
16846#endif
16847
16848
16849/**
16850 * CVTPS2PI
16851 */
16852#ifdef IEM_WITHOUT_ASSEMBLY
16853static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
16854{
16855 RTFLOAT32U r32Src;
16856 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
16857
16858 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16859 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16860 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16861}
16862
16863
16864IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
16865{
16866 RTUINT64U uDst;
16867 RTUINT64U uSrc = { u64Src };
16868 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
16869 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
16870 *pu64Dst = uDst.u;
16871 *pfMxcsr = fMxcsrOut;
16872}
16873#endif
16874
16875
16876/**
16877 * CVTTPS2PI
16878 */
16879#ifdef IEM_WITHOUT_ASSEMBLY
16880static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
16881{
16882 RTFLOAT32U r32Src;
16883 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
16884
16885 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16886 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16887 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16888}
16889
16890
16891IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
16892{
16893 RTUINT64U uDst;
16894 RTUINT64U uSrc = { u64Src };
16895 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
16896 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
16897 *pu64Dst = uDst.u;
16898 *pfMxcsr = fMxcsrOut;
16899}
16900#endif
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette