VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 102020

Last change on this file since 102020 was 101378, checked in by vboxsync, 14 months ago

VMM/IEM: Implement vmpsadbw instruction emulations [review fix], bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 706.2 KB
Line 
1/* $Id: IEMAllAImplC.cpp 101378 2023-10-06 08:38:56Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
48/** @def IEM_WITHOUT_ASSEMBLY
49 * Enables all the code in this file.
50 */
51#if !defined(IEM_WITHOUT_ASSEMBLY)
52# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
53# define IEM_WITHOUT_ASSEMBLY
54# endif
55#endif
56/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
57#ifdef IEM_WITH_ASSEMBLY
58# undef IEM_WITHOUT_ASSEMBLY
59#endif
60
61/**
62 * Calculates the signed flag value given a result and it's bit width.
63 *
64 * The signed flag (SF) is a duplication of the most significant bit in the
65 * result.
66 *
67 * @returns X86_EFL_SF or 0.
68 * @param a_uResult Unsigned result value.
69 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
70 */
71#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
72 ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) & X86_EFL_SF )
73
74/**
75 * Calculates the zero flag value given a result.
76 *
77 * The zero flag (ZF) indicates whether the result is zero or not.
78 *
79 * @returns X86_EFL_ZF or 0.
80 * @param a_uResult Unsigned result value.
81 */
82#define X86_EFL_CALC_ZF(a_uResult) \
83 ( (uint32_t)((a_uResult) == 0) << X86_EFL_ZF_BIT )
84
85/**
86 * Extracts the OF flag from a OF calculation result.
87 *
88 * These are typically used by concating with a bitcount. The problem is that
89 * 8-bit values needs shifting in the other direction than the others.
90 */
91#define X86_EFL_GET_OF_8(a_uValue) (((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF)
92#define X86_EFL_GET_OF_16(a_uValue) ((uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
93#define X86_EFL_GET_OF_32(a_uValue) ((uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
94#define X86_EFL_GET_OF_64(a_uValue) ((uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
95
96/**
97 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after arithmetic op.
98 *
99 * @returns Status bits.
100 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
101 * @param a_uResult Unsigned result value.
102 * @param a_uSrc The source value (for AF calc).
103 * @param a_uDst The original destination value (for AF calc).
104 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
105 * @param a_CfExpr Bool expression for the carry flag (CF).
106 * @param a_uSrcOf The a_uSrc value to use for overflow calculation.
107 */
108#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
109 do { \
110 uint32_t fEflTmp = *(a_pfEFlags); \
111 fEflTmp &= ~X86_EFL_STATUS_BITS; \
112 fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
113 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
114 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
115 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
116 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
117 \
118 /* Overflow during ADDition happens when both inputs have the same signed \
119 bit value and the result has a different sign bit value. \
120 \
121 Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
122 follows that for SUBtraction the signed bit value must differ between \
123 the two inputs and the result's signed bit diff from the first input. \
124 Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
125 \
126 See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
127 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth( ( ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
128 & RT_BIT_64(a_cBitsWidth - 1)) \
129 & ((a_uResult) ^ (a_uDst)) ); \
130 *(a_pfEFlags) = fEflTmp; \
131 } while (0)
132
133/**
134 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
135 *
136 * CF and OF are defined to be 0 by logical operations. AF on the other hand is
137 * undefined. We do not set AF, as that seems to make the most sense (which
138 * probably makes it the most wrong in real life).
139 *
140 * @returns Status bits.
141 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
142 * @param a_uResult Unsigned result value.
143 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
144 * @param a_fExtra Additional bits to set.
145 */
146#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
147 do { \
148 uint32_t fEflTmp = *(a_pfEFlags); \
149 fEflTmp &= ~X86_EFL_STATUS_BITS; \
150 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
151 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
152 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
153 fEflTmp |= (a_fExtra); \
154 *(a_pfEFlags) = fEflTmp; \
155 } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
161/**
162 * Parity calculation table.
163 *
164 * This is also used by iemAllAImpl.asm.
165 *
166 * The generator code:
167 * @code
168 * #include <stdio.h>
169 *
170 * int main()
171 * {
172 * unsigned b;
173 * for (b = 0; b < 256; b++)
174 * {
175 * int cOnes = ( b & 1)
176 * + ((b >> 1) & 1)
177 * + ((b >> 2) & 1)
178 * + ((b >> 3) & 1)
179 * + ((b >> 4) & 1)
180 * + ((b >> 5) & 1)
181 * + ((b >> 6) & 1)
182 * + ((b >> 7) & 1);
183 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
184 * b,
185 * (b >> 7) & 1,
186 * (b >> 6) & 1,
187 * (b >> 5) & 1,
188 * (b >> 4) & 1,
189 * (b >> 3) & 1,
190 * (b >> 2) & 1,
191 * (b >> 1) & 1,
192 * b & 1,
193 * cOnes & 1 ? "0" : "X86_EFL_PF");
194 * }
195 * return 0;
196 * }
197 * @endcode
198 */
199uint8_t const g_afParity[256] =
200{
201 /* 0000 = 00000000b */ X86_EFL_PF,
202 /* 0x01 = 00000001b */ 0,
203 /* 0x02 = 00000010b */ 0,
204 /* 0x03 = 00000011b */ X86_EFL_PF,
205 /* 0x04 = 00000100b */ 0,
206 /* 0x05 = 00000101b */ X86_EFL_PF,
207 /* 0x06 = 00000110b */ X86_EFL_PF,
208 /* 0x07 = 00000111b */ 0,
209 /* 0x08 = 00001000b */ 0,
210 /* 0x09 = 00001001b */ X86_EFL_PF,
211 /* 0x0a = 00001010b */ X86_EFL_PF,
212 /* 0x0b = 00001011b */ 0,
213 /* 0x0c = 00001100b */ X86_EFL_PF,
214 /* 0x0d = 00001101b */ 0,
215 /* 0x0e = 00001110b */ 0,
216 /* 0x0f = 00001111b */ X86_EFL_PF,
217 /* 0x10 = 00010000b */ 0,
218 /* 0x11 = 00010001b */ X86_EFL_PF,
219 /* 0x12 = 00010010b */ X86_EFL_PF,
220 /* 0x13 = 00010011b */ 0,
221 /* 0x14 = 00010100b */ X86_EFL_PF,
222 /* 0x15 = 00010101b */ 0,
223 /* 0x16 = 00010110b */ 0,
224 /* 0x17 = 00010111b */ X86_EFL_PF,
225 /* 0x18 = 00011000b */ X86_EFL_PF,
226 /* 0x19 = 00011001b */ 0,
227 /* 0x1a = 00011010b */ 0,
228 /* 0x1b = 00011011b */ X86_EFL_PF,
229 /* 0x1c = 00011100b */ 0,
230 /* 0x1d = 00011101b */ X86_EFL_PF,
231 /* 0x1e = 00011110b */ X86_EFL_PF,
232 /* 0x1f = 00011111b */ 0,
233 /* 0x20 = 00100000b */ 0,
234 /* 0x21 = 00100001b */ X86_EFL_PF,
235 /* 0x22 = 00100010b */ X86_EFL_PF,
236 /* 0x23 = 00100011b */ 0,
237 /* 0x24 = 00100100b */ X86_EFL_PF,
238 /* 0x25 = 00100101b */ 0,
239 /* 0x26 = 00100110b */ 0,
240 /* 0x27 = 00100111b */ X86_EFL_PF,
241 /* 0x28 = 00101000b */ X86_EFL_PF,
242 /* 0x29 = 00101001b */ 0,
243 /* 0x2a = 00101010b */ 0,
244 /* 0x2b = 00101011b */ X86_EFL_PF,
245 /* 0x2c = 00101100b */ 0,
246 /* 0x2d = 00101101b */ X86_EFL_PF,
247 /* 0x2e = 00101110b */ X86_EFL_PF,
248 /* 0x2f = 00101111b */ 0,
249 /* 0x30 = 00110000b */ X86_EFL_PF,
250 /* 0x31 = 00110001b */ 0,
251 /* 0x32 = 00110010b */ 0,
252 /* 0x33 = 00110011b */ X86_EFL_PF,
253 /* 0x34 = 00110100b */ 0,
254 /* 0x35 = 00110101b */ X86_EFL_PF,
255 /* 0x36 = 00110110b */ X86_EFL_PF,
256 /* 0x37 = 00110111b */ 0,
257 /* 0x38 = 00111000b */ 0,
258 /* 0x39 = 00111001b */ X86_EFL_PF,
259 /* 0x3a = 00111010b */ X86_EFL_PF,
260 /* 0x3b = 00111011b */ 0,
261 /* 0x3c = 00111100b */ X86_EFL_PF,
262 /* 0x3d = 00111101b */ 0,
263 /* 0x3e = 00111110b */ 0,
264 /* 0x3f = 00111111b */ X86_EFL_PF,
265 /* 0x40 = 01000000b */ 0,
266 /* 0x41 = 01000001b */ X86_EFL_PF,
267 /* 0x42 = 01000010b */ X86_EFL_PF,
268 /* 0x43 = 01000011b */ 0,
269 /* 0x44 = 01000100b */ X86_EFL_PF,
270 /* 0x45 = 01000101b */ 0,
271 /* 0x46 = 01000110b */ 0,
272 /* 0x47 = 01000111b */ X86_EFL_PF,
273 /* 0x48 = 01001000b */ X86_EFL_PF,
274 /* 0x49 = 01001001b */ 0,
275 /* 0x4a = 01001010b */ 0,
276 /* 0x4b = 01001011b */ X86_EFL_PF,
277 /* 0x4c = 01001100b */ 0,
278 /* 0x4d = 01001101b */ X86_EFL_PF,
279 /* 0x4e = 01001110b */ X86_EFL_PF,
280 /* 0x4f = 01001111b */ 0,
281 /* 0x50 = 01010000b */ X86_EFL_PF,
282 /* 0x51 = 01010001b */ 0,
283 /* 0x52 = 01010010b */ 0,
284 /* 0x53 = 01010011b */ X86_EFL_PF,
285 /* 0x54 = 01010100b */ 0,
286 /* 0x55 = 01010101b */ X86_EFL_PF,
287 /* 0x56 = 01010110b */ X86_EFL_PF,
288 /* 0x57 = 01010111b */ 0,
289 /* 0x58 = 01011000b */ 0,
290 /* 0x59 = 01011001b */ X86_EFL_PF,
291 /* 0x5a = 01011010b */ X86_EFL_PF,
292 /* 0x5b = 01011011b */ 0,
293 /* 0x5c = 01011100b */ X86_EFL_PF,
294 /* 0x5d = 01011101b */ 0,
295 /* 0x5e = 01011110b */ 0,
296 /* 0x5f = 01011111b */ X86_EFL_PF,
297 /* 0x60 = 01100000b */ X86_EFL_PF,
298 /* 0x61 = 01100001b */ 0,
299 /* 0x62 = 01100010b */ 0,
300 /* 0x63 = 01100011b */ X86_EFL_PF,
301 /* 0x64 = 01100100b */ 0,
302 /* 0x65 = 01100101b */ X86_EFL_PF,
303 /* 0x66 = 01100110b */ X86_EFL_PF,
304 /* 0x67 = 01100111b */ 0,
305 /* 0x68 = 01101000b */ 0,
306 /* 0x69 = 01101001b */ X86_EFL_PF,
307 /* 0x6a = 01101010b */ X86_EFL_PF,
308 /* 0x6b = 01101011b */ 0,
309 /* 0x6c = 01101100b */ X86_EFL_PF,
310 /* 0x6d = 01101101b */ 0,
311 /* 0x6e = 01101110b */ 0,
312 /* 0x6f = 01101111b */ X86_EFL_PF,
313 /* 0x70 = 01110000b */ 0,
314 /* 0x71 = 01110001b */ X86_EFL_PF,
315 /* 0x72 = 01110010b */ X86_EFL_PF,
316 /* 0x73 = 01110011b */ 0,
317 /* 0x74 = 01110100b */ X86_EFL_PF,
318 /* 0x75 = 01110101b */ 0,
319 /* 0x76 = 01110110b */ 0,
320 /* 0x77 = 01110111b */ X86_EFL_PF,
321 /* 0x78 = 01111000b */ X86_EFL_PF,
322 /* 0x79 = 01111001b */ 0,
323 /* 0x7a = 01111010b */ 0,
324 /* 0x7b = 01111011b */ X86_EFL_PF,
325 /* 0x7c = 01111100b */ 0,
326 /* 0x7d = 01111101b */ X86_EFL_PF,
327 /* 0x7e = 01111110b */ X86_EFL_PF,
328 /* 0x7f = 01111111b */ 0,
329 /* 0x80 = 10000000b */ 0,
330 /* 0x81 = 10000001b */ X86_EFL_PF,
331 /* 0x82 = 10000010b */ X86_EFL_PF,
332 /* 0x83 = 10000011b */ 0,
333 /* 0x84 = 10000100b */ X86_EFL_PF,
334 /* 0x85 = 10000101b */ 0,
335 /* 0x86 = 10000110b */ 0,
336 /* 0x87 = 10000111b */ X86_EFL_PF,
337 /* 0x88 = 10001000b */ X86_EFL_PF,
338 /* 0x89 = 10001001b */ 0,
339 /* 0x8a = 10001010b */ 0,
340 /* 0x8b = 10001011b */ X86_EFL_PF,
341 /* 0x8c = 10001100b */ 0,
342 /* 0x8d = 10001101b */ X86_EFL_PF,
343 /* 0x8e = 10001110b */ X86_EFL_PF,
344 /* 0x8f = 10001111b */ 0,
345 /* 0x90 = 10010000b */ X86_EFL_PF,
346 /* 0x91 = 10010001b */ 0,
347 /* 0x92 = 10010010b */ 0,
348 /* 0x93 = 10010011b */ X86_EFL_PF,
349 /* 0x94 = 10010100b */ 0,
350 /* 0x95 = 10010101b */ X86_EFL_PF,
351 /* 0x96 = 10010110b */ X86_EFL_PF,
352 /* 0x97 = 10010111b */ 0,
353 /* 0x98 = 10011000b */ 0,
354 /* 0x99 = 10011001b */ X86_EFL_PF,
355 /* 0x9a = 10011010b */ X86_EFL_PF,
356 /* 0x9b = 10011011b */ 0,
357 /* 0x9c = 10011100b */ X86_EFL_PF,
358 /* 0x9d = 10011101b */ 0,
359 /* 0x9e = 10011110b */ 0,
360 /* 0x9f = 10011111b */ X86_EFL_PF,
361 /* 0xa0 = 10100000b */ X86_EFL_PF,
362 /* 0xa1 = 10100001b */ 0,
363 /* 0xa2 = 10100010b */ 0,
364 /* 0xa3 = 10100011b */ X86_EFL_PF,
365 /* 0xa4 = 10100100b */ 0,
366 /* 0xa5 = 10100101b */ X86_EFL_PF,
367 /* 0xa6 = 10100110b */ X86_EFL_PF,
368 /* 0xa7 = 10100111b */ 0,
369 /* 0xa8 = 10101000b */ 0,
370 /* 0xa9 = 10101001b */ X86_EFL_PF,
371 /* 0xaa = 10101010b */ X86_EFL_PF,
372 /* 0xab = 10101011b */ 0,
373 /* 0xac = 10101100b */ X86_EFL_PF,
374 /* 0xad = 10101101b */ 0,
375 /* 0xae = 10101110b */ 0,
376 /* 0xaf = 10101111b */ X86_EFL_PF,
377 /* 0xb0 = 10110000b */ 0,
378 /* 0xb1 = 10110001b */ X86_EFL_PF,
379 /* 0xb2 = 10110010b */ X86_EFL_PF,
380 /* 0xb3 = 10110011b */ 0,
381 /* 0xb4 = 10110100b */ X86_EFL_PF,
382 /* 0xb5 = 10110101b */ 0,
383 /* 0xb6 = 10110110b */ 0,
384 /* 0xb7 = 10110111b */ X86_EFL_PF,
385 /* 0xb8 = 10111000b */ X86_EFL_PF,
386 /* 0xb9 = 10111001b */ 0,
387 /* 0xba = 10111010b */ 0,
388 /* 0xbb = 10111011b */ X86_EFL_PF,
389 /* 0xbc = 10111100b */ 0,
390 /* 0xbd = 10111101b */ X86_EFL_PF,
391 /* 0xbe = 10111110b */ X86_EFL_PF,
392 /* 0xbf = 10111111b */ 0,
393 /* 0xc0 = 11000000b */ X86_EFL_PF,
394 /* 0xc1 = 11000001b */ 0,
395 /* 0xc2 = 11000010b */ 0,
396 /* 0xc3 = 11000011b */ X86_EFL_PF,
397 /* 0xc4 = 11000100b */ 0,
398 /* 0xc5 = 11000101b */ X86_EFL_PF,
399 /* 0xc6 = 11000110b */ X86_EFL_PF,
400 /* 0xc7 = 11000111b */ 0,
401 /* 0xc8 = 11001000b */ 0,
402 /* 0xc9 = 11001001b */ X86_EFL_PF,
403 /* 0xca = 11001010b */ X86_EFL_PF,
404 /* 0xcb = 11001011b */ 0,
405 /* 0xcc = 11001100b */ X86_EFL_PF,
406 /* 0xcd = 11001101b */ 0,
407 /* 0xce = 11001110b */ 0,
408 /* 0xcf = 11001111b */ X86_EFL_PF,
409 /* 0xd0 = 11010000b */ 0,
410 /* 0xd1 = 11010001b */ X86_EFL_PF,
411 /* 0xd2 = 11010010b */ X86_EFL_PF,
412 /* 0xd3 = 11010011b */ 0,
413 /* 0xd4 = 11010100b */ X86_EFL_PF,
414 /* 0xd5 = 11010101b */ 0,
415 /* 0xd6 = 11010110b */ 0,
416 /* 0xd7 = 11010111b */ X86_EFL_PF,
417 /* 0xd8 = 11011000b */ X86_EFL_PF,
418 /* 0xd9 = 11011001b */ 0,
419 /* 0xda = 11011010b */ 0,
420 /* 0xdb = 11011011b */ X86_EFL_PF,
421 /* 0xdc = 11011100b */ 0,
422 /* 0xdd = 11011101b */ X86_EFL_PF,
423 /* 0xde = 11011110b */ X86_EFL_PF,
424 /* 0xdf = 11011111b */ 0,
425 /* 0xe0 = 11100000b */ 0,
426 /* 0xe1 = 11100001b */ X86_EFL_PF,
427 /* 0xe2 = 11100010b */ X86_EFL_PF,
428 /* 0xe3 = 11100011b */ 0,
429 /* 0xe4 = 11100100b */ X86_EFL_PF,
430 /* 0xe5 = 11100101b */ 0,
431 /* 0xe6 = 11100110b */ 0,
432 /* 0xe7 = 11100111b */ X86_EFL_PF,
433 /* 0xe8 = 11101000b */ X86_EFL_PF,
434 /* 0xe9 = 11101001b */ 0,
435 /* 0xea = 11101010b */ 0,
436 /* 0xeb = 11101011b */ X86_EFL_PF,
437 /* 0xec = 11101100b */ 0,
438 /* 0xed = 11101101b */ X86_EFL_PF,
439 /* 0xee = 11101110b */ X86_EFL_PF,
440 /* 0xef = 11101111b */ 0,
441 /* 0xf0 = 11110000b */ X86_EFL_PF,
442 /* 0xf1 = 11110001b */ 0,
443 /* 0xf2 = 11110010b */ 0,
444 /* 0xf3 = 11110011b */ X86_EFL_PF,
445 /* 0xf4 = 11110100b */ 0,
446 /* 0xf5 = 11110101b */ X86_EFL_PF,
447 /* 0xf6 = 11110110b */ X86_EFL_PF,
448 /* 0xf7 = 11110111b */ 0,
449 /* 0xf8 = 11111000b */ 0,
450 /* 0xf9 = 11111001b */ X86_EFL_PF,
451 /* 0xfa = 11111010b */ X86_EFL_PF,
452 /* 0xfb = 11111011b */ 0,
453 /* 0xfc = 11111100b */ X86_EFL_PF,
454 /* 0xfd = 11111101b */ 0,
455 /* 0xfe = 11111110b */ 0,
456 /* 0xff = 11111111b */ X86_EFL_PF,
457};
458
459/* for clang: */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT80U g_ar80One[];
464extern const RTFLOAT80U g_r80Indefinite;
465extern const RTFLOAT32U g_ar32Infinity[];
466extern const RTFLOAT64U g_ar64Infinity[];
467extern const RTFLOAT80U g_ar80Infinity[];
468extern const RTFLOAT128U g_r128Ln2;
469extern const RTUINT128U g_u128Ln2Mantissa;
470extern const RTUINT128U g_u128Ln2MantissaIntel;
471extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
472extern const RTFLOAT32U g_ar32QNaN[];
473extern const RTFLOAT64U g_ar64QNaN[];
474
475/** Zero values (indexed by fSign). */
476RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
477RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
478RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
479
480/** One values (indexed by fSign). */
481RTFLOAT80U const g_ar80One[] =
482{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
483
484/** Indefinite (negative). */
485RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
486
487/** Infinities (indexed by fSign). */
488RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
489RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
490RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
491
492/** Default QNaNs (indexed by fSign). */
493RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
494RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
495
496
497#if 0
498/** 128-bit floating point constant: 2.0 */
499const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
500#endif
501
502
503/* The next section is generated by tools/IEMGenFpuConstants: */
504
505/** The ln2 constant as 128-bit floating point value.
506 * base-10: 6.93147180559945309417232121458176575e-1
507 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
508 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
509//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
510const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
511/** High precision ln2 value.
512 * base-10: 6.931471805599453094172321214581765680747e-1
513 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
514 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
515const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
516/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
517 * base-10: 6.931471805599453094151379470289064954613e-1
518 * base-16: b.17217f7d1cf79abc0000000000000000@-1
519 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
520const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
521
522/** Horner constants for f2xm1 */
523const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
524{
525 /* a0
526 * base-10: 1.00000000000000000000000000000000000e0
527 * base-16: 1.0000000000000000000000000000@0
528 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
529 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
530 /* a1
531 * base-10: 5.00000000000000000000000000000000000e-1
532 * base-16: 8.0000000000000000000000000000@-1
533 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
534 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
535 /* a2
536 * base-10: 1.66666666666666666666666666666666658e-1
537 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
538 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
539 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
540 /* a3
541 * base-10: 4.16666666666666666666666666666666646e-2
542 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
543 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
544 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
545 /* a4
546 * base-10: 8.33333333333333333333333333333333323e-3
547 * base-16: 2.2222222222222222222222222222@-2
548 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
549 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
550 /* a5
551 * base-10: 1.38888888888888888888888888888888874e-3
552 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
553 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
554 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
555 /* a6
556 * base-10: 1.98412698412698412698412698412698412e-4
557 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
558 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
559 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
560 /* a7
561 * base-10: 2.48015873015873015873015873015873015e-5
562 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
563 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
564 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
565 /* a8
566 * base-10: 2.75573192239858906525573192239858902e-6
567 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
568 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
569 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
570 /* a9
571 * base-10: 2.75573192239858906525573192239858865e-7
572 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
573 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
574 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
575 /* a10
576 * base-10: 2.50521083854417187750521083854417184e-8
577 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
578 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
579 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
580 /* a11
581 * base-10: 2.08767569878680989792100903212014296e-9
582 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
583 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
584 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
585 /* a12
586 * base-10: 1.60590438368216145993923771701549472e-10
587 * base-16: b.092309d43684be51c198e91d7b40@-9
588 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
589 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
590 /* a13
591 * base-10: 1.14707455977297247138516979786821043e-11
592 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
593 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
594 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
595 /* a14
596 * base-10: 7.64716373181981647590113198578806964e-13
597 * base-16: d.73f9f399dc0f88ec32b587746578@-11
598 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
599 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
600 /* a15
601 * base-10: 4.77947733238738529743820749111754352e-14
602 * base-16: d.73f9f399dc0f88ec32b587746578@-12
603 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
604 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
605 /* a16
606 * base-10: 2.81145725434552076319894558301031970e-15
607 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
608 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
609 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
610 /* a17
611 * base-10: 1.56192069685862264622163643500573321e-16
612 * base-16: b.413c31dcbecbbdd8024435161550@-14
613 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
614 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
615 /* a18
616 * base-10: 8.22063524662432971695598123687227980e-18
617 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
618 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
619 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
620 /* a19
621 * base-10: 4.11031762331216485847799061843614006e-19
622 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
623 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
624 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
625 /* a20
626 * base-10: 1.95729410633912612308475743735054143e-20
627 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
628 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
629 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
630 /* a21
631 * base-10: 8.89679139245057328674889744250246106e-22
632 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
633 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
634 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
635};
636
637
638/*
639 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
640 * it all in C is probably safer atm., optimize what's necessary later, maybe.
641 */
642#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
643
644
645/*********************************************************************************************************************************
646* Binary Operations *
647*********************************************************************************************************************************/
648
649/*
650 * ADD
651 */
652
653IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
654{
655 uint64_t uDst = *puDst;
656 uint64_t uResult = uDst + uSrc;
657 *puDst = uResult;
658 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
659}
660
661# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
662
663IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
664{
665 uint32_t uDst = *puDst;
666 uint32_t uResult = uDst + uSrc;
667 *puDst = uResult;
668 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
669}
670
671
672IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
673{
674 uint16_t uDst = *puDst;
675 uint16_t uResult = uDst + uSrc;
676 *puDst = uResult;
677 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
678}
679
680
681IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
682{
683 uint8_t uDst = *puDst;
684 uint8_t uResult = uDst + uSrc;
685 *puDst = uResult;
686 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
687}
688
689# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
690
691/*
692 * ADC
693 */
694
695IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
696{
697 if (!(*pfEFlags & X86_EFL_CF))
698 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
699 else
700 {
701 uint64_t uDst = *puDst;
702 uint64_t uResult = uDst + uSrc + 1;
703 *puDst = uResult;
704 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
705 }
706}
707
708# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
709
710IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
711{
712 if (!(*pfEFlags & X86_EFL_CF))
713 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
714 else
715 {
716 uint32_t uDst = *puDst;
717 uint32_t uResult = uDst + uSrc + 1;
718 *puDst = uResult;
719 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
720 }
721}
722
723
724IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
725{
726 if (!(*pfEFlags & X86_EFL_CF))
727 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
728 else
729 {
730 uint16_t uDst = *puDst;
731 uint16_t uResult = uDst + uSrc + 1;
732 *puDst = uResult;
733 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
734 }
735}
736
737
738IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
739{
740 if (!(*pfEFlags & X86_EFL_CF))
741 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
742 else
743 {
744 uint8_t uDst = *puDst;
745 uint8_t uResult = uDst + uSrc + 1;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
748 }
749}
750
751# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
752
753/*
754 * SUB
755 */
756
757IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
758{
759 uint64_t uDst = *puDst;
760 uint64_t uResult = uDst - uSrc;
761 *puDst = uResult;
762 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
763}
764
765# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
766
767IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
768{
769 uint32_t uDst = *puDst;
770 uint32_t uResult = uDst - uSrc;
771 *puDst = uResult;
772 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
773}
774
775
776IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
777{
778 uint16_t uDst = *puDst;
779 uint16_t uResult = uDst - uSrc;
780 *puDst = uResult;
781 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
782}
783
784
785IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
786{
787 uint8_t uDst = *puDst;
788 uint8_t uResult = uDst - uSrc;
789 *puDst = uResult;
790 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
791}
792
793# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
794
795/*
796 * SBB
797 */
798
799IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
800{
801 if (!(*pfEFlags & X86_EFL_CF))
802 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
803 else
804 {
805 uint64_t uDst = *puDst;
806 uint64_t uResult = uDst - uSrc - 1;
807 *puDst = uResult;
808 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
809 }
810}
811
812# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
813
814IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
815{
816 if (!(*pfEFlags & X86_EFL_CF))
817 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
818 else
819 {
820 uint32_t uDst = *puDst;
821 uint32_t uResult = uDst - uSrc - 1;
822 *puDst = uResult;
823 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
824 }
825}
826
827
828IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
829{
830 if (!(*pfEFlags & X86_EFL_CF))
831 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
832 else
833 {
834 uint16_t uDst = *puDst;
835 uint16_t uResult = uDst - uSrc - 1;
836 *puDst = uResult;
837 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
838 }
839}
840
841
842IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
843{
844 if (!(*pfEFlags & X86_EFL_CF))
845 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
846 else
847 {
848 uint8_t uDst = *puDst;
849 uint8_t uResult = uDst - uSrc - 1;
850 *puDst = uResult;
851 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
852 }
853}
854
855# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
856
857
858/*
859 * OR
860 */
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
863{
864 uint64_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
867}
868
869# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
870
871IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
872{
873 uint32_t uResult = *puDst | uSrc;
874 *puDst = uResult;
875 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
876}
877
878
879IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
880{
881 uint16_t uResult = *puDst | uSrc;
882 *puDst = uResult;
883 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
884}
885
886
887IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
888{
889 uint8_t uResult = *puDst | uSrc;
890 *puDst = uResult;
891 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
892}
893
894# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
895
896/*
897 * XOR
898 */
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
901{
902 uint64_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
905}
906
907# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
908
909IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
910{
911 uint32_t uResult = *puDst ^ uSrc;
912 *puDst = uResult;
913 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
914}
915
916
917IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
918{
919 uint16_t uResult = *puDst ^ uSrc;
920 *puDst = uResult;
921 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
922}
923
924
925IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
926{
927 uint8_t uResult = *puDst ^ uSrc;
928 *puDst = uResult;
929 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
930}
931
932# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
933
934/*
935 * AND
936 */
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
939{
940 uint64_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
943}
944
945# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
946
947IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
948{
949 uint32_t const uResult = *puDst & uSrc;
950 *puDst = uResult;
951 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
952}
953
954
955IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
956{
957 uint16_t const uResult = *puDst & uSrc;
958 *puDst = uResult;
959 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
960}
961
962
963IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
964{
965 uint8_t const uResult = *puDst & uSrc;
966 *puDst = uResult;
967 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
968}
969
970# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
971#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
972
973/*
974 * ANDN (BMI1 instruction)
975 */
976
977IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
978{
979 uint64_t const uResult = ~uSrc1 & uSrc2;
980 *puDst = uResult;
981 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
982}
983
984
985IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
986{
987 uint32_t const uResult = ~uSrc1 & uSrc2;
988 *puDst = uResult;
989 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
990}
991
992
993#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
994IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
995{
996 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
997}
998#endif
999
1000
1001#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1002IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1003{
1004 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1005}
1006#endif
1007
1008#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1009
1010/*
1011 * CMP
1012 */
1013
1014IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1015{
1016 uint64_t uDstTmp = *puDst;
1017 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1018}
1019
1020# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1021
1022IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1023{
1024 uint32_t uDstTmp = *puDst;
1025 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1026}
1027
1028
1029IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1030{
1031 uint16_t uDstTmp = *puDst;
1032 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1033}
1034
1035
1036IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1037{
1038 uint8_t uDstTmp = *puDst;
1039 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1040}
1041
1042# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1043
1044/*
1045 * TEST
1046 */
1047
1048IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1049{
1050 uint64_t uResult = *puDst & uSrc;
1051 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1052}
1053
1054# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1055
1056IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1057{
1058 uint32_t uResult = *puDst & uSrc;
1059 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1060}
1061
1062
1063IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1064{
1065 uint16_t uResult = *puDst & uSrc;
1066 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1067}
1068
1069
1070IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t const *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1071{
1072 uint8_t uResult = *puDst & uSrc;
1073 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1074}
1075
1076# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1077
1078
1079/*
1080 * LOCK prefixed variants of the above
1081 */
1082
1083/** 64-bit locked binary operand operation. */
1084# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1085 do { \
1086 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1087 uint ## a_cBitsWidth ## _t uTmp; \
1088 uint32_t fEflTmp; \
1089 do \
1090 { \
1091 uTmp = uOld; \
1092 fEflTmp = *pfEFlags; \
1093 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, uSrc, &fEflTmp); \
1094 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
1095 *pfEFlags = fEflTmp; \
1096 } while (0)
1097
1098
1099#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1100 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
1101 uint ## a_cBitsWidth ## _t uSrc, \
1102 uint32_t *pfEFlags)) \
1103 { \
1104 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1105 }
1106
1107EMIT_LOCKED_BIN_OP(add, 64)
1108EMIT_LOCKED_BIN_OP(adc, 64)
1109EMIT_LOCKED_BIN_OP(sub, 64)
1110EMIT_LOCKED_BIN_OP(sbb, 64)
1111EMIT_LOCKED_BIN_OP(or, 64)
1112EMIT_LOCKED_BIN_OP(xor, 64)
1113EMIT_LOCKED_BIN_OP(and, 64)
1114# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1115EMIT_LOCKED_BIN_OP(add, 32)
1116EMIT_LOCKED_BIN_OP(adc, 32)
1117EMIT_LOCKED_BIN_OP(sub, 32)
1118EMIT_LOCKED_BIN_OP(sbb, 32)
1119EMIT_LOCKED_BIN_OP(or, 32)
1120EMIT_LOCKED_BIN_OP(xor, 32)
1121EMIT_LOCKED_BIN_OP(and, 32)
1122
1123EMIT_LOCKED_BIN_OP(add, 16)
1124EMIT_LOCKED_BIN_OP(adc, 16)
1125EMIT_LOCKED_BIN_OP(sub, 16)
1126EMIT_LOCKED_BIN_OP(sbb, 16)
1127EMIT_LOCKED_BIN_OP(or, 16)
1128EMIT_LOCKED_BIN_OP(xor, 16)
1129EMIT_LOCKED_BIN_OP(and, 16)
1130
1131EMIT_LOCKED_BIN_OP(add, 8)
1132EMIT_LOCKED_BIN_OP(adc, 8)
1133EMIT_LOCKED_BIN_OP(sub, 8)
1134EMIT_LOCKED_BIN_OP(sbb, 8)
1135EMIT_LOCKED_BIN_OP(or, 8)
1136EMIT_LOCKED_BIN_OP(xor, 8)
1137EMIT_LOCKED_BIN_OP(and, 8)
1138# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1139
1140
1141/*
1142 * Bit operations (same signature as above).
1143 */
1144
1145/*
1146 * BT
1147 */
1148
1149IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1150{
1151 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1152 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1153 Assert(uSrc < 64);
1154 uint64_t uDst = *puDst;
1155 if (uDst & RT_BIT_64(uSrc))
1156 *pfEFlags |= X86_EFL_CF;
1157 else
1158 *pfEFlags &= ~X86_EFL_CF;
1159}
1160
1161# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1162
1163IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1164{
1165 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1166 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1167 Assert(uSrc < 32);
1168 uint32_t uDst = *puDst;
1169 if (uDst & RT_BIT_32(uSrc))
1170 *pfEFlags |= X86_EFL_CF;
1171 else
1172 *pfEFlags &= ~X86_EFL_CF;
1173}
1174
1175IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1176{
1177 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1178 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1179 Assert(uSrc < 16);
1180 uint16_t uDst = *puDst;
1181 if (uDst & RT_BIT_32(uSrc))
1182 *pfEFlags |= X86_EFL_CF;
1183 else
1184 *pfEFlags &= ~X86_EFL_CF;
1185}
1186
1187# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1188
1189/*
1190 * BTC
1191 */
1192
1193IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1194{
1195 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1196 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1197 Assert(uSrc < 64);
1198 uint64_t fMask = RT_BIT_64(uSrc);
1199 uint64_t uDst = *puDst;
1200 if (uDst & fMask)
1201 {
1202 uDst &= ~fMask;
1203 *puDst = uDst;
1204 *pfEFlags |= X86_EFL_CF;
1205 }
1206 else
1207 {
1208 uDst |= fMask;
1209 *puDst = uDst;
1210 *pfEFlags &= ~X86_EFL_CF;
1211 }
1212}
1213
1214# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1215
1216IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1217{
1218 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1219 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1220 Assert(uSrc < 32);
1221 uint32_t fMask = RT_BIT_32(uSrc);
1222 uint32_t uDst = *puDst;
1223 if (uDst & fMask)
1224 {
1225 uDst &= ~fMask;
1226 *puDst = uDst;
1227 *pfEFlags |= X86_EFL_CF;
1228 }
1229 else
1230 {
1231 uDst |= fMask;
1232 *puDst = uDst;
1233 *pfEFlags &= ~X86_EFL_CF;
1234 }
1235}
1236
1237
1238IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1239{
1240 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1241 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1242 Assert(uSrc < 16);
1243 uint16_t fMask = RT_BIT_32(uSrc);
1244 uint16_t uDst = *puDst;
1245 if (uDst & fMask)
1246 {
1247 uDst &= ~fMask;
1248 *puDst = uDst;
1249 *pfEFlags |= X86_EFL_CF;
1250 }
1251 else
1252 {
1253 uDst |= fMask;
1254 *puDst = uDst;
1255 *pfEFlags &= ~X86_EFL_CF;
1256 }
1257}
1258
1259# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1260
1261/*
1262 * BTR
1263 */
1264
1265IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1266{
1267 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1268 logical operation (AND/OR/whatever). */
1269 Assert(uSrc < 64);
1270 uint64_t fMask = RT_BIT_64(uSrc);
1271 uint64_t uDst = *puDst;
1272 if (uDst & fMask)
1273 {
1274 uDst &= ~fMask;
1275 *puDst = uDst;
1276 *pfEFlags |= X86_EFL_CF;
1277 }
1278 else
1279 *pfEFlags &= ~X86_EFL_CF;
1280}
1281
1282# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1283
1284IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1285{
1286 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1287 logical operation (AND/OR/whatever). */
1288 Assert(uSrc < 32);
1289 uint32_t fMask = RT_BIT_32(uSrc);
1290 uint32_t uDst = *puDst;
1291 if (uDst & fMask)
1292 {
1293 uDst &= ~fMask;
1294 *puDst = uDst;
1295 *pfEFlags |= X86_EFL_CF;
1296 }
1297 else
1298 *pfEFlags &= ~X86_EFL_CF;
1299}
1300
1301
1302IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1303{
1304 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1305 logical operation (AND/OR/whatever). */
1306 Assert(uSrc < 16);
1307 uint16_t fMask = RT_BIT_32(uSrc);
1308 uint16_t uDst = *puDst;
1309 if (uDst & fMask)
1310 {
1311 uDst &= ~fMask;
1312 *puDst = uDst;
1313 *pfEFlags |= X86_EFL_CF;
1314 }
1315 else
1316 *pfEFlags &= ~X86_EFL_CF;
1317}
1318
1319# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1320
1321/*
1322 * BTS
1323 */
1324
1325IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1326{
1327 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1328 logical operation (AND/OR/whatever). */
1329 Assert(uSrc < 64);
1330 uint64_t fMask = RT_BIT_64(uSrc);
1331 uint64_t uDst = *puDst;
1332 if (uDst & fMask)
1333 *pfEFlags |= X86_EFL_CF;
1334 else
1335 {
1336 uDst |= fMask;
1337 *puDst = uDst;
1338 *pfEFlags &= ~X86_EFL_CF;
1339 }
1340}
1341
1342# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1343
1344IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1345{
1346 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1347 logical operation (AND/OR/whatever). */
1348 Assert(uSrc < 32);
1349 uint32_t fMask = RT_BIT_32(uSrc);
1350 uint32_t uDst = *puDst;
1351 if (uDst & fMask)
1352 *pfEFlags |= X86_EFL_CF;
1353 else
1354 {
1355 uDst |= fMask;
1356 *puDst = uDst;
1357 *pfEFlags &= ~X86_EFL_CF;
1358 }
1359}
1360
1361
1362IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1363{
1364 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1365 logical operation (AND/OR/whatever). */
1366 Assert(uSrc < 16);
1367 uint16_t fMask = RT_BIT_32(uSrc);
1368 uint32_t uDst = *puDst;
1369 if (uDst & fMask)
1370 *pfEFlags |= X86_EFL_CF;
1371 else
1372 {
1373 uDst |= fMask;
1374 *puDst = uDst;
1375 *pfEFlags &= ~X86_EFL_CF;
1376 }
1377}
1378
1379# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1380
1381
1382EMIT_LOCKED_BIN_OP(btc, 64)
1383EMIT_LOCKED_BIN_OP(btr, 64)
1384EMIT_LOCKED_BIN_OP(bts, 64)
1385# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1386EMIT_LOCKED_BIN_OP(btc, 32)
1387EMIT_LOCKED_BIN_OP(btr, 32)
1388EMIT_LOCKED_BIN_OP(bts, 32)
1389
1390EMIT_LOCKED_BIN_OP(btc, 16)
1391EMIT_LOCKED_BIN_OP(btr, 16)
1392EMIT_LOCKED_BIN_OP(bts, 16)
1393# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1394
1395
1396/*
1397 * Helpers for BSR and BSF.
1398 *
1399 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1400 * Intel behavior modelled on 10980xe, AMD on 3990X. Other marchs may
1401 * produce different result (see https://www.sandpile.org/x86/flags.htm),
1402 * but we restrict ourselves to emulating these recent marchs.
1403 */
1404#define SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlag, a_iBit) do { \
1405 unsigned iBit = (a_iBit); \
1406 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1407 if (iBit) \
1408 { \
1409 *puDst = --iBit; \
1410 fEfl |= g_afParity[iBit]; \
1411 } \
1412 else \
1413 fEfl |= X86_EFL_ZF | X86_EFL_PF; \
1414 *pfEFlags = fEfl; \
1415 } while (0)
1416#define SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlag, a_iBit) do { \
1417 unsigned const iBit = (a_iBit); \
1418 if (iBit) \
1419 { \
1420 *puDst = iBit - 1; \
1421 *pfEFlags &= ~X86_EFL_ZF; \
1422 } \
1423 else \
1424 *pfEFlags |= X86_EFL_ZF; \
1425 } while (0)
1426
1427
1428/*
1429 * BSF - first (least significant) bit set
1430 */
1431IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1432{
1433 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1434}
1435
1436IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1437{
1438 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1439}
1440
1441IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1442{
1443 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1444}
1445
1446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1447
1448IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1449{
1450 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1451}
1452
1453IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1454{
1455 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1456}
1457
1458IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1459{
1460 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1461}
1462
1463
1464IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1465{
1466 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1467}
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1472}
1473
1474IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1475{
1476 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1477}
1478
1479# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1480
1481
1482/*
1483 * BSR - last (most significant) bit set
1484 */
1485IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1486{
1487 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1488}
1489
1490IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1491{
1492 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1493}
1494
1495IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1496{
1497 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1498}
1499
1500# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1501
1502IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1503{
1504 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1505}
1506
1507IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1510}
1511
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516
1517
1518IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1519{
1520 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1521}
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1526}
1527
1528IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1529{
1530 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1531}
1532
1533# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1534
1535
1536/*
1537 * Helpers for LZCNT and TZCNT.
1538 */
1539#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
1540 unsigned const uResult = (a_uResult); \
1541 *(a_puDst) = uResult; \
1542 uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1543 if (uResult) \
1544 fEfl |= g_afParity[uResult]; \
1545 else \
1546 fEfl |= X86_EFL_ZF | X86_EFL_PF; \
1547 if (!a_uSrc) \
1548 fEfl |= X86_EFL_CF; \
1549 *(a_pfEFlags) = fEfl; \
1550 } while (0)
1551#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
1552 unsigned const uResult = (a_uResult); \
1553 *(a_puDst) = uResult; \
1554 uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
1555 if (!uResult) \
1556 fEfl |= X86_EFL_ZF; \
1557 if (!a_uSrc) \
1558 fEfl |= X86_EFL_CF; \
1559 *(a_pfEFlags) = fEfl; \
1560 } while (0)
1561
1562
1563/*
1564 * LZCNT - count leading zero bits.
1565 */
1566IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1567{
1568 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1569}
1570
1571IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1572{
1573 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1574}
1575
1576IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1577{
1578 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1579}
1580
1581# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1582
1583IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1584{
1585 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1586}
1587
1588IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1589{
1590 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1591}
1592
1593IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1594{
1595 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1596}
1597
1598
1599IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1600{
1601 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1602}
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1605{
1606 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1607}
1608
1609IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1610{
1611 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1612}
1613
1614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1615
1616
1617/*
1618 * TZCNT - count leading zero bits.
1619 */
1620IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1621{
1622 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1623}
1624
1625IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1626{
1627 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1628}
1629
1630IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1631{
1632 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1633}
1634
1635# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1636
1637IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1638{
1639 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1640}
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1643{
1644 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1650}
1651
1652
1653IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1654{
1655 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1656}
1657
1658IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1659{
1660 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1661}
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1666}
1667
1668# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1669#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1670
1671/*
1672 * BEXTR (BMI1 instruction)
1673 */
1674#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1675IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1676 a_Type uSrc2, uint32_t *pfEFlags)) \
1677{ \
1678 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1679 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1680 a_Type uResult; \
1681 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1682 if (iFirstBit < a_cBits) \
1683 { \
1684 uResult = uSrc1 >> iFirstBit; \
1685 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1686 if (cBits < a_cBits) \
1687 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1688 *puDst = uResult; \
1689 if (!uResult) \
1690 fEfl |= X86_EFL_ZF; \
1691 } \
1692 else \
1693 { \
1694 *puDst = uResult = 0; \
1695 fEfl |= X86_EFL_ZF; \
1696 } \
1697 /** @todo complete flag calculations. */ \
1698 *pfEFlags = fEfl; \
1699}
1700
1701EMIT_BEXTR(64, uint64_t, _fallback)
1702EMIT_BEXTR(32, uint32_t, _fallback)
1703#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1704EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1705#endif
1706#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1708#endif
1709
1710/*
1711 * BLSR (BMI1 instruction)
1712 */
1713#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1714IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1715{ \
1716 uint32_t fEfl1 = *pfEFlags; \
1717 uint32_t fEfl2 = fEfl1; \
1718 *puDst = uSrc; \
1719 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1720 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1721 \
1722 /* AMD: The carry flag is from the SUB operation. */ \
1723 /* 10890xe: PF always cleared? */ \
1724 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1725 fEfl2 |= fEfl1 & X86_EFL_CF; \
1726 *pfEFlags = fEfl2; \
1727}
1728
1729EMIT_BLSR(64, uint64_t, _fallback)
1730EMIT_BLSR(32, uint32_t, _fallback)
1731#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1732EMIT_BLSR(64, uint64_t, RT_NOTHING)
1733#endif
1734#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSR(32, uint32_t, RT_NOTHING)
1736#endif
1737
1738/*
1739 * BLSMSK (BMI1 instruction)
1740 */
1741#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1742IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1743{ \
1744 uint32_t fEfl1 = *pfEFlags; \
1745 uint32_t fEfl2 = fEfl1; \
1746 *puDst = uSrc; \
1747 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1748 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1749 \
1750 /* AMD: The carry flag is from the SUB operation. */ \
1751 /* 10890xe: PF always cleared? */ \
1752 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1753 fEfl2 |= fEfl1 & X86_EFL_CF; \
1754 *pfEFlags = fEfl2; \
1755}
1756
1757EMIT_BLSMSK(64, uint64_t, _fallback)
1758EMIT_BLSMSK(32, uint32_t, _fallback)
1759#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1760EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1761#endif
1762#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1764#endif
1765
1766/*
1767 * BLSI (BMI1 instruction)
1768 */
1769#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1770IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1771{ \
1772 uint32_t fEfl1 = *pfEFlags; \
1773 uint32_t fEfl2 = fEfl1; \
1774 *puDst = uSrc; \
1775 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1776 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1777 \
1778 /* AMD: The carry flag is from the SUB operation. */ \
1779 /* 10890xe: PF always cleared? */ \
1780 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1781 fEfl2 |= fEfl1 & X86_EFL_CF; \
1782 *pfEFlags = fEfl2; \
1783}
1784
1785EMIT_BLSI(64, uint64_t, _fallback)
1786EMIT_BLSI(32, uint32_t, _fallback)
1787#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1788EMIT_BLSI(64, uint64_t, RT_NOTHING)
1789#endif
1790#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1791EMIT_BLSI(32, uint32_t, RT_NOTHING)
1792#endif
1793
1794/*
1795 * BZHI (BMI2 instruction)
1796 */
1797#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1798IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1799 a_Type uSrc2, uint32_t *pfEFlags)) \
1800{ \
1801 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1802 a_Type uResult; \
1803 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1804 if (iFirstBit < a_cBits) \
1805 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1806 else \
1807 { \
1808 uResult = uSrc1; \
1809 fEfl |= X86_EFL_CF; \
1810 } \
1811 *puDst = uResult; \
1812 fEfl |= X86_EFL_CALC_ZF(uResult); \
1813 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1814 *pfEFlags = fEfl; \
1815}
1816
1817EMIT_BZHI(64, uint64_t, _fallback)
1818EMIT_BZHI(32, uint32_t, _fallback)
1819#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1820EMIT_BZHI(64, uint64_t, RT_NOTHING)
1821#endif
1822#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1823EMIT_BZHI(32, uint32_t, RT_NOTHING)
1824#endif
1825
1826/*
1827 * POPCNT
1828 */
1829RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1830{
1831 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1832 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1833 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1834 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1835};
1836
1837/** @todo Use native popcount where possible and employ some more efficient
1838 * algorithm here (or in asm.h fallback)! */
1839
1840DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1841{
1842 return g_abBitCounts6[ u16 & 0x3f]
1843 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1844 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1845}
1846
1847DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1848{
1849 return g_abBitCounts6[ u32 & 0x3f]
1850 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1851 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1852 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1853 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1854 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1855}
1856
1857DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1858{
1859 return g_abBitCounts6[ u64 & 0x3f]
1860 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1861 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1862 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1863 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1864 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1865 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1866 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1867 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1868 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1869 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1870}
1871
1872#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1873IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1874{ \
1875 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1876 a_Type uResult; \
1877 if (uSrc) \
1878 uResult = iemPopCountU ## a_cBits(uSrc); \
1879 else \
1880 { \
1881 fEfl |= X86_EFL_ZF; \
1882 uResult = 0; \
1883 } \
1884 *puDst = uResult; \
1885 *pfEFlags = fEfl; \
1886}
1887
1888EMIT_POPCNT(64, uint64_t, _fallback)
1889EMIT_POPCNT(32, uint32_t, _fallback)
1890EMIT_POPCNT(16, uint16_t, _fallback)
1891#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1892EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1893#endif
1894#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1895EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1896EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1897#endif
1898
1899
1900#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1901
1902/*
1903 * XCHG
1904 */
1905
1906IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1907{
1908#if ARCH_BITS >= 64
1909 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1910#else
1911 uint64_t uOldMem = *puMem;
1912 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1913 ASMNopPause();
1914 *puReg = uOldMem;
1915#endif
1916}
1917
1918# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1919
1920IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1921{
1922 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1923}
1924
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1927{
1928 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1929}
1930
1931
1932IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1933{
1934 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1935}
1936
1937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1938
1939
1940/* Unlocked variants for fDisregardLock mode: */
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1943{
1944 uint64_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1950
1951IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1952{
1953 uint32_t const uOld = *puMem;
1954 *puMem = *puReg;
1955 *puReg = uOld;
1956}
1957
1958
1959IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1960{
1961 uint16_t const uOld = *puMem;
1962 *puMem = *puReg;
1963 *puReg = uOld;
1964}
1965
1966
1967IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1968{
1969 uint8_t const uOld = *puMem;
1970 *puMem = *puReg;
1971 *puReg = uOld;
1972}
1973
1974# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1975
1976
1977/*
1978 * XADD and LOCK XADD.
1979 */
1980#define EMIT_XADD(a_cBitsWidth, a_Type) \
1981IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1982{ \
1983 a_Type uDst = *puDst; \
1984 a_Type uResult = uDst; \
1985 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1986 *puDst = uResult; \
1987 *puReg = uDst; \
1988} \
1989\
1990IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1991{ \
1992 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1993 a_Type uResult; \
1994 uint32_t fEflTmp; \
1995 do \
1996 { \
1997 uResult = uOld; \
1998 fEflTmp = *pfEFlags; \
1999 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2000 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2001 *puReg = uOld; \
2002 *pfEFlags = fEflTmp; \
2003}
2004EMIT_XADD(64, uint64_t)
2005# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2006EMIT_XADD(32, uint32_t)
2007EMIT_XADD(16, uint16_t)
2008EMIT_XADD(8, uint8_t)
2009# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2010
2011#endif
2012
2013/*
2014 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2015 *
2016 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2017 * instructions are emulated as locked.
2018 */
2019#if defined(IEM_WITHOUT_ASSEMBLY)
2020
2021IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2022{
2023 uint8_t uOld = *puAl;
2024 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2025 Assert(*puAl == uOld);
2026 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2027}
2028
2029
2030IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2031{
2032 uint16_t uOld = *puAx;
2033 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2034 Assert(*puAx == uOld);
2035 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2040{
2041 uint32_t uOld = *puEax;
2042 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2043 Assert(*puEax == uOld);
2044 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2045}
2046
2047
2048# if ARCH_BITS == 32
2049IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2050# else
2051IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2052# endif
2053{
2054# if ARCH_BITS == 32
2055 uint64_t const uSrcReg = *puSrcReg;
2056# endif
2057 uint64_t uOld = *puRax;
2058 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2059 Assert(*puRax == uOld);
2060 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2061}
2062
2063
2064IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2065 uint32_t *pEFlags))
2066{
2067 uint64_t const uNew = pu64EbxEcx->u;
2068 uint64_t const uOld = pu64EaxEdx->u;
2069 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2070 {
2071 Assert(pu64EaxEdx->u == uOld);
2072 *pEFlags |= X86_EFL_ZF;
2073 }
2074 else
2075 *pEFlags &= ~X86_EFL_ZF;
2076}
2077
2078
2079# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
2080IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2081 uint32_t *pEFlags))
2082{
2083# ifdef VBOX_STRICT
2084 RTUINT128U const uOld = *pu128RaxRdx;
2085# endif
2086# if defined(RT_ARCH_AMD64)
2087 if (ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
2088 &pu128RaxRdx->u))
2089# else
2090 if (ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u))
2091# endif
2092 {
2093 Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
2094 *pEFlags |= X86_EFL_ZF;
2095 }
2096 else
2097 *pEFlags &= ~X86_EFL_ZF;
2098}
2099# endif
2100
2101#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2102
2103# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2104IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2105 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2106{
2107 RTUINT128U u128Tmp = *pu128Dst;
2108 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2109 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2110 {
2111 *pu128Dst = *pu128RbxRcx;
2112 *pEFlags |= X86_EFL_ZF;
2113 }
2114 else
2115 {
2116 *pu128RaxRdx = u128Tmp;
2117 *pEFlags &= ~X86_EFL_ZF;
2118 }
2119}
2120#endif /* !RT_ARCH_ARM64 */
2121
2122#if defined(IEM_WITHOUT_ASSEMBLY)
2123
2124/* Unlocked versions mapped to the locked ones: */
2125
2126IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2127{
2128 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2129}
2130
2131
2132IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2133{
2134 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2135}
2136
2137
2138IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2139{
2140 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2141}
2142
2143
2144# if ARCH_BITS == 32
2145IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2146{
2147 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2148}
2149# else
2150IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2151{
2152 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2153}
2154# endif
2155
2156
2157IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2158{
2159 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2160}
2161
2162
2163IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2164 uint32_t *pEFlags))
2165{
2166 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2167}
2168
2169#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2170
2171#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2172 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2173
2174/*
2175 * MUL, IMUL, DIV and IDIV helpers.
2176 *
2177 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2178 * division step so we can select between using C operators and
2179 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2180 *
2181 * - The U8 versions work returns output in AL + AH instead of xDX + xAX, with the
2182 * IDIV/DIV taking all the input in AX too. This means we have to abstract some
2183 * input loads and the result storing.
2184 */
2185
2186DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2187{
2188# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2189 pQuotient->s.Lo = 0;
2190 pQuotient->s.Hi = 0;
2191# endif
2192 RTUINT128U Divisor;
2193 Divisor.s.Lo = u64Divisor;
2194 Divisor.s.Hi = 0;
2195 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2196}
2197
2198# define DIV_LOAD(a_Dividend) \
2199 a_Dividend.s.Lo = *puA, a_Dividend.s.Hi = *puD
2200# define DIV_LOAD_U8(a_Dividend) \
2201 a_Dividend.u = *puAX
2202
2203# define DIV_STORE(a_Quotient, a_uReminder) *puA = (a_Quotient), *puD = (a_uReminder)
2204# define DIV_STORE_U8(a_Quotient, a_uReminder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uReminder) << 8)
2205
2206# define MUL_LOAD_F1() *puA
2207# define MUL_LOAD_F1_U8() ((uint8_t)*puAX)
2208
2209# define MUL_STORE(a_Result) *puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi
2210# define MUL_STORE_U8(a_Result) *puAX = a_Result.u
2211
2212# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
2213 (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
2214# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
2215 RTUInt128AssignNeg(&(a_Value))
2216
2217# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
2218 (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
2219# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
2220 RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2);
2221
2222# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
2223 a_Quotient.u = (a_Dividend).u / (a_uDivisor), \
2224 a_Remainder.u = (a_Dividend).u % (a_uDivisor)
2225# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
2226 RTUInt128DivRemByU64(&a_Quotient, &a_Remainder, &a_Dividend, a_uDivisor)
2227
2228
2229/*
2230 * MUL
2231 */
2232# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2233IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2234{ \
2235 RTUINT ## a_cBitsWidth2x ## U Result; \
2236 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2237 a_fnStore(Result); \
2238 \
2239 /* Calc EFLAGS: */ \
2240 uint32_t fEfl = *pfEFlags; \
2241 if (a_fIntelFlags) \
2242 { /* Intel: 6700K and 10980XE behavior */ \
2243 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2244 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2245 fEfl |= X86_EFL_SF; \
2246 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2247 if (Result.s.Hi != 0) \
2248 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2249 } \
2250 else \
2251 { /* AMD: 3990X */ \
2252 if (Result.s.Hi != 0) \
2253 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2254 else \
2255 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2256 } \
2257 *pfEFlags = fEfl; \
2258 return 0; \
2259} \
2260
2261# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2262 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2263 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2264 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2265
2266# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2267EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2268 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2269# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2270EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2271 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2272EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2273 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2274EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2275 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2276# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2277# endif /* !DOXYGEN_RUNNING */
2278
2279/*
2280 * MULX
2281 */
2282# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2283IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2284 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2285{ \
2286 RTUINT ## a_cBitsWidth2x ## U Result; \
2287 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2288 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2289 *puDst1 = Result.s.Hi; \
2290} \
2291
2292# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2293EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2294EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2295# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2296EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2297EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2298# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2299# endif /* !DOXYGEN_RUNNING */
2300
2301
2302/*
2303 * IMUL
2304 *
2305 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2306 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2307 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2308 */
2309# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2310 a_Suffix, a_fIntelFlags) \
2311IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2312{ \
2313 RTUINT ## a_cBitsWidth2x ## U Result; \
2314 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2315 \
2316 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2317 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2318 { \
2319 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2320 { \
2321 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2322 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2323 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2324 } \
2325 else \
2326 { \
2327 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2328 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2329 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2330 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2331 a_fnNeg(Result, a_cBitsWidth2x); \
2332 } \
2333 } \
2334 else \
2335 { \
2336 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2337 { \
2338 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2339 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2340 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2341 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2342 a_fnNeg(Result, a_cBitsWidth2x); \
2343 } \
2344 else \
2345 { \
2346 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2347 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2348 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2349 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2350 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2351 } \
2352 } \
2353 a_fnStore(Result); \
2354 \
2355 if (a_fIntelFlags) \
2356 { \
2357 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2358 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2359 fEfl |= X86_EFL_SF; \
2360 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2361 } \
2362 *pfEFlags = fEfl; \
2363 return 0; \
2364}
2365# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2366 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2367 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2368 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2369
2370# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2371EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2372 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2373# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2374EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2375 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2376EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2377 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2378EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2379 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2380# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2381# endif /* !DOXYGEN_RUNNING */
2382
2383
2384/*
2385 * IMUL with two operands are mapped onto the three operand variant, ignoring
2386 * the high part of the product.
2387 */
2388# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2389IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2390{ \
2391 a_uType uIgn; \
2392 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2393} \
2394\
2395IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2396{ \
2397 a_uType uIgn; \
2398 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2399} \
2400\
2401IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2402{ \
2403 a_uType uIgn; \
2404 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2405}
2406
2407EMIT_IMUL_TWO(64, uint64_t)
2408# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2409EMIT_IMUL_TWO(32, uint32_t)
2410EMIT_IMUL_TWO(16, uint16_t)
2411# endif
2412
2413
2414/*
2415 * DIV
2416 */
2417# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2418 a_Suffix, a_fIntelFlags) \
2419IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2420{ \
2421 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2422 a_fnLoad(Dividend); \
2423 if ( uDivisor != 0 \
2424 && Dividend.s.Hi < uDivisor) \
2425 { \
2426 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2427 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2428 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2429 \
2430 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2431 if (!a_fIntelFlags) \
2432 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2433 return 0; \
2434 } \
2435 /* #DE */ \
2436 return -1; \
2437}
2438# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2439 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2440 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2441 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2442
2443# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2444EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2445 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2447EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2448 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2449EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2450 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2451EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2452 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2453# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2454# endif /* !DOXYGEN_RUNNING */
2455
2456
2457/*
2458 * IDIV
2459 *
2460 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2461 * set AF and clear PF, ZF and SF just like it does for DIV.
2462 *
2463 */
2464# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2465 a_Suffix, a_fIntelFlags) \
2466IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2467{ \
2468 /* Note! Skylake leaves all flags alone. */ \
2469 \
2470 /** @todo overflow checks */ \
2471 if (uDivisor != 0) \
2472 { \
2473 /* \
2474 * Convert to unsigned division. \
2475 */ \
2476 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2477 a_fnLoad(Dividend); \
2478 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2479 if (fSignedDividend) \
2480 a_fnNeg(Dividend, a_cBitsWidth2x); \
2481 \
2482 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2483 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2484 uDivisorPositive = uDivisor; \
2485 else \
2486 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2487 \
2488 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2489 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2490 \
2491 /* \
2492 * Setup the result, checking for overflows. \
2493 */ \
2494 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2495 { \
2496 if (!fSignedDividend) \
2497 { \
2498 /* Positive divisor, positive dividend => result positive. */ \
2499 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2500 { \
2501 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2502 if (!a_fIntelFlags) \
2503 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2504 return 0; \
2505 } \
2506 } \
2507 else \
2508 { \
2509 /* Positive divisor, negative dividend => result negative. */ \
2510 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2511 { \
2512 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2513 if (!a_fIntelFlags) \
2514 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2515 return 0; \
2516 } \
2517 } \
2518 } \
2519 else \
2520 { \
2521 if (!fSignedDividend) \
2522 { \
2523 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2524 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2525 { \
2526 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2527 if (!a_fIntelFlags) \
2528 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2529 return 0; \
2530 } \
2531 } \
2532 else \
2533 { \
2534 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2535 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2536 { \
2537 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2538 if (!a_fIntelFlags) \
2539 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2540 return 0; \
2541 } \
2542 } \
2543 } \
2544 } \
2545 /* #DE */ \
2546 return -1; \
2547}
2548# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2549 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2550 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2551 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2552
2553# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2554EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2555 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2557EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2558 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2559EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2560 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2561EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2562 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2563# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2564# endif /* !DOXYGEN_RUNNING */
2565
2566#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2567
2568
2569/*********************************************************************************************************************************
2570* Unary operations. *
2571*********************************************************************************************************************************/
2572#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2573
2574/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
2575 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
2576 *
2577 * CF is NOT modified for hysterical raisins (allegedly for carrying and
2578 * borrowing in arithmetic loops on intel 8008).
2579 *
2580 * @returns Status bits.
2581 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2582 * @param a_uResult Unsigned result value.
2583 * @param a_uDst The original destination value (for AF calc).
2584 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2585 * @param a_OfMethod 0 for INC-style, 1 for DEC-style.
2586 */
2587#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
2588 do { \
2589 uint32_t fEflTmp = *(a_pfEFlags); \
2590 fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; \
2591 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
2592 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2593 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2594 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2595 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
2596 : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
2597 *(a_pfEFlags) = fEflTmp; \
2598 } while (0)
2599
2600/*
2601 * INC
2602 */
2603
2604IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2605{
2606 uint64_t uDst = *puDst;
2607 uint64_t uResult = uDst + 1;
2608 *puDst = uResult;
2609 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2610}
2611
2612# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2613
2614IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2615{
2616 uint32_t uDst = *puDst;
2617 uint32_t uResult = uDst + 1;
2618 *puDst = uResult;
2619 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2620}
2621
2622
2623IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2624{
2625 uint16_t uDst = *puDst;
2626 uint16_t uResult = uDst + 1;
2627 *puDst = uResult;
2628 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2629}
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint8_t uDst = *puDst;
2634 uint8_t uResult = uDst + 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2637}
2638
2639# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2640
2641
2642/*
2643 * DEC
2644 */
2645
2646IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2647{
2648 uint64_t uDst = *puDst;
2649 uint64_t uResult = uDst - 1;
2650 *puDst = uResult;
2651 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2652}
2653
2654# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2655
2656IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2657{
2658 uint32_t uDst = *puDst;
2659 uint32_t uResult = uDst - 1;
2660 *puDst = uResult;
2661 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2662}
2663
2664
2665IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2666{
2667 uint16_t uDst = *puDst;
2668 uint16_t uResult = uDst - 1;
2669 *puDst = uResult;
2670 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2671}
2672
2673
2674IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2675{
2676 uint8_t uDst = *puDst;
2677 uint8_t uResult = uDst - 1;
2678 *puDst = uResult;
2679 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2680}
2681
2682# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2683
2684
2685/*
2686 * NOT
2687 */
2688
2689IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2690{
2691 uint64_t uDst = *puDst;
2692 uint64_t uResult = ~uDst;
2693 *puDst = uResult;
2694 /* EFLAGS are not modified. */
2695 RT_NOREF_PV(pfEFlags);
2696}
2697
2698# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2699
2700IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2701{
2702 uint32_t uDst = *puDst;
2703 uint32_t uResult = ~uDst;
2704 *puDst = uResult;
2705 /* EFLAGS are not modified. */
2706 RT_NOREF_PV(pfEFlags);
2707}
2708
2709IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2710{
2711 uint16_t uDst = *puDst;
2712 uint16_t uResult = ~uDst;
2713 *puDst = uResult;
2714 /* EFLAGS are not modified. */
2715 RT_NOREF_PV(pfEFlags);
2716}
2717
2718IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2719{
2720 uint8_t uDst = *puDst;
2721 uint8_t uResult = ~uDst;
2722 *puDst = uResult;
2723 /* EFLAGS are not modified. */
2724 RT_NOREF_PV(pfEFlags);
2725}
2726
2727# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2728
2729
2730/*
2731 * NEG
2732 */
2733
2734/**
2735 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for an NEG instruction.
2736 *
2737 * @returns Status bits.
2738 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2739 * @param a_uResult Unsigned result value.
2740 * @param a_uDst The original destination value (for AF calc).
2741 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2742 */
2743#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
2744 do { \
2745 uint32_t fEflTmp = *(a_pfEFlags); \
2746 fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; \
2747 fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
2748 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
2749 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2750 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2751 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2752 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
2753 *(a_pfEFlags) = fEflTmp; \
2754 } while (0)
2755
2756IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2757{
2758 uint64_t uDst = *puDst;
2759 uint64_t uResult = (uint64_t)0 - uDst;
2760 *puDst = uResult;
2761 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2762}
2763
2764# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2765
2766IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2767{
2768 uint32_t uDst = *puDst;
2769 uint32_t uResult = (uint32_t)0 - uDst;
2770 *puDst = uResult;
2771 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2772}
2773
2774
2775IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2776{
2777 uint16_t uDst = *puDst;
2778 uint16_t uResult = (uint16_t)0 - uDst;
2779 *puDst = uResult;
2780 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2781}
2782
2783
2784IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2785{
2786 uint8_t uDst = *puDst;
2787 uint8_t uResult = (uint8_t)0 - uDst;
2788 *puDst = uResult;
2789 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2790}
2791
2792# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2793
2794/*
2795 * Locked variants.
2796 */
2797
2798/** Emit a function for doing a locked unary operand operation. */
2799# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2800 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2801 uint32_t *pfEFlags)) \
2802 { \
2803 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2804 uint ## a_cBitsWidth ## _t uTmp; \
2805 uint32_t fEflTmp; \
2806 do \
2807 { \
2808 uTmp = uOld; \
2809 fEflTmp = *pfEFlags; \
2810 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2811 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2812 *pfEFlags = fEflTmp; \
2813 }
2814
2815EMIT_LOCKED_UNARY_OP(inc, 64)
2816EMIT_LOCKED_UNARY_OP(dec, 64)
2817EMIT_LOCKED_UNARY_OP(not, 64)
2818EMIT_LOCKED_UNARY_OP(neg, 64)
2819# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2820EMIT_LOCKED_UNARY_OP(inc, 32)
2821EMIT_LOCKED_UNARY_OP(dec, 32)
2822EMIT_LOCKED_UNARY_OP(not, 32)
2823EMIT_LOCKED_UNARY_OP(neg, 32)
2824
2825EMIT_LOCKED_UNARY_OP(inc, 16)
2826EMIT_LOCKED_UNARY_OP(dec, 16)
2827EMIT_LOCKED_UNARY_OP(not, 16)
2828EMIT_LOCKED_UNARY_OP(neg, 16)
2829
2830EMIT_LOCKED_UNARY_OP(inc, 8)
2831EMIT_LOCKED_UNARY_OP(dec, 8)
2832EMIT_LOCKED_UNARY_OP(not, 8)
2833EMIT_LOCKED_UNARY_OP(neg, 8)
2834# endif
2835
2836#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2837
2838
2839/*********************************************************************************************************************************
2840* Shifting and Rotating *
2841*********************************************************************************************************************************/
2842
2843/*
2844 * ROL
2845 */
2846#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2847IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2848{ \
2849 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2850 if (cShift) \
2851 { \
2852 if (a_cBitsWidth < 32) \
2853 cShift &= a_cBitsWidth - 1; \
2854 a_uType const uDst = *puDst; \
2855 a_uType const uResult = a_fnHlp(uDst, cShift); \
2856 *puDst = uResult; \
2857 \
2858 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
2859 it the same way as for 1 bit shifts. */ \
2860 AssertCompile(X86_EFL_CF_BIT == 0); \
2861 uint32_t fEfl = *pfEFlags; \
2862 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2863 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2864 fEfl |= fCarry; \
2865 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2866 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2867 else /* Intel 10980XE: According to the first sub-shift: */ \
2868 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2869 *pfEFlags = fEfl; \
2870 } \
2871}
2872
2873#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2874EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2875#endif
2876EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2877EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2878
2879#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2880EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2881#endif
2882EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2883EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2884
2885DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2886{
2887 return (uValue << cShift) | (uValue >> (16 - cShift));
2888}
2889#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2890EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2891#endif
2892EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2893EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2894
2895DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2896{
2897 return (uValue << cShift) | (uValue >> (8 - cShift));
2898}
2899#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2900EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2901#endif
2902EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2903EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2904
2905
2906/*
2907 * ROR
2908 */
2909#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2910IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2911{ \
2912 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2913 if (cShift) \
2914 { \
2915 if (a_cBitsWidth < 32) \
2916 cShift &= a_cBitsWidth - 1; \
2917 a_uType const uDst = *puDst; \
2918 a_uType const uResult = a_fnHlp(uDst, cShift); \
2919 *puDst = uResult; \
2920 \
2921 /* Calc EFLAGS: */ \
2922 AssertCompile(X86_EFL_CF_BIT == 0); \
2923 uint32_t fEfl = *pfEFlags; \
2924 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2925 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2926 fEfl |= fCarry; \
2927 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2928 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2929 else /* Intel 10980XE: According to the first sub-shift: */ \
2930 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2931 *pfEFlags = fEfl; \
2932 } \
2933}
2934
2935#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2936EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2937#endif
2938EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2939EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2940
2941#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2942EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2943#endif
2944EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2945EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2946
2947DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2948{
2949 return (uValue >> cShift) | (uValue << (16 - cShift));
2950}
2951#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2952EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2953#endif
2954EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2955EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2956
2957DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2958{
2959 return (uValue >> cShift) | (uValue << (8 - cShift));
2960}
2961#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2962EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2963#endif
2964EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2965EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2966
2967
2968/*
2969 * RCL
2970 */
2971#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2972IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2973{ \
2974 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2975 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2976 cShift %= a_cBitsWidth + 1; \
2977 if (cShift) \
2978 { \
2979 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2980 cShift %= a_cBitsWidth + 1; \
2981 a_uType const uDst = *puDst; \
2982 a_uType uResult = uDst << cShift; \
2983 if (cShift > 1) \
2984 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2985 \
2986 AssertCompile(X86_EFL_CF_BIT == 0); \
2987 uint32_t fEfl = *pfEFlags; \
2988 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2989 uResult |= (a_uType)fInCarry << (cShift - 1); \
2990 \
2991 *puDst = uResult; \
2992 \
2993 /* Calc EFLAGS. */ \
2994 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2995 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2996 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2997 fEfl |= fOutCarry; \
2998 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2999 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3000 else /* Intel 10980XE: According to the first sub-shift: */ \
3001 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3002 *pfEFlags = fEfl; \
3003 } \
3004}
3005
3006#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3007EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3008#endif
3009EMIT_RCL(64, uint64_t, _intel, 1)
3010EMIT_RCL(64, uint64_t, _amd, 0)
3011
3012#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3013EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3014#endif
3015EMIT_RCL(32, uint32_t, _intel, 1)
3016EMIT_RCL(32, uint32_t, _amd, 0)
3017
3018#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3019EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3020#endif
3021EMIT_RCL(16, uint16_t, _intel, 1)
3022EMIT_RCL(16, uint16_t, _amd, 0)
3023
3024#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3025EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3026#endif
3027EMIT_RCL(8, uint8_t, _intel, 1)
3028EMIT_RCL(8, uint8_t, _amd, 0)
3029
3030
3031/*
3032 * RCR
3033 */
3034#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3035IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3036{ \
3037 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3038 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3039 cShift %= a_cBitsWidth + 1; \
3040 if (cShift) \
3041 { \
3042 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3043 cShift %= a_cBitsWidth + 1; \
3044 a_uType const uDst = *puDst; \
3045 a_uType uResult = uDst >> cShift; \
3046 if (cShift > 1) \
3047 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3048 \
3049 AssertCompile(X86_EFL_CF_BIT == 0); \
3050 uint32_t fEfl = *pfEFlags; \
3051 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3052 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3053 *puDst = uResult; \
3054 \
3055 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3056 it the same way as for 1 bit shifts. */ \
3057 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3058 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3059 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3060 fEfl |= fOutCarry; \
3061 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3062 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3063 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3064 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3065 *pfEFlags = fEfl; \
3066 } \
3067}
3068
3069#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3070EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3071#endif
3072EMIT_RCR(64, uint64_t, _intel, 1)
3073EMIT_RCR(64, uint64_t, _amd, 0)
3074
3075#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3076EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3077#endif
3078EMIT_RCR(32, uint32_t, _intel, 1)
3079EMIT_RCR(32, uint32_t, _amd, 0)
3080
3081#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3082EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3083#endif
3084EMIT_RCR(16, uint16_t, _intel, 1)
3085EMIT_RCR(16, uint16_t, _amd, 0)
3086
3087#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3088EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3089#endif
3090EMIT_RCR(8, uint8_t, _intel, 1)
3091EMIT_RCR(8, uint8_t, _amd, 0)
3092
3093
3094/*
3095 * SHL
3096 */
3097#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3098IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3099{ \
3100 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3101 if (cShift) \
3102 { \
3103 a_uType const uDst = *puDst; \
3104 a_uType uResult = uDst << cShift; \
3105 *puDst = uResult; \
3106 \
3107 /* Calc EFLAGS. */ \
3108 AssertCompile(X86_EFL_CF_BIT == 0); \
3109 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3110 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3111 fEfl |= fCarry; \
3112 if (!a_fIntelFlags) \
3113 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3114 else \
3115 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3116 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3117 fEfl |= X86_EFL_CALC_ZF(uResult); \
3118 fEfl |= g_afParity[uResult & 0xff]; \
3119 if (!a_fIntelFlags) \
3120 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3121 *pfEFlags = fEfl; \
3122 } \
3123}
3124
3125#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3126EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3127#endif
3128EMIT_SHL(64, uint64_t, _intel, 1)
3129EMIT_SHL(64, uint64_t, _amd, 0)
3130
3131#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3132EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3133#endif
3134EMIT_SHL(32, uint32_t, _intel, 1)
3135EMIT_SHL(32, uint32_t, _amd, 0)
3136
3137#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3138EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3139#endif
3140EMIT_SHL(16, uint16_t, _intel, 1)
3141EMIT_SHL(16, uint16_t, _amd, 0)
3142
3143#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3144EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3145#endif
3146EMIT_SHL(8, uint8_t, _intel, 1)
3147EMIT_SHL(8, uint8_t, _amd, 0)
3148
3149
3150/*
3151 * SHR
3152 */
3153#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3154IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3155{ \
3156 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3157 if (cShift) \
3158 { \
3159 a_uType const uDst = *puDst; \
3160 a_uType uResult = uDst >> cShift; \
3161 *puDst = uResult; \
3162 \
3163 /* Calc EFLAGS. */ \
3164 AssertCompile(X86_EFL_CF_BIT == 0); \
3165 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3166 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3167 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3168 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3169 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3170 fEfl |= X86_EFL_CALC_ZF(uResult); \
3171 fEfl |= g_afParity[uResult & 0xff]; \
3172 if (!a_fIntelFlags) \
3173 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3174 *pfEFlags = fEfl; \
3175 } \
3176}
3177
3178#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3179EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3180#endif
3181EMIT_SHR(64, uint64_t, _intel, 1)
3182EMIT_SHR(64, uint64_t, _amd, 0)
3183
3184#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3185EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3186#endif
3187EMIT_SHR(32, uint32_t, _intel, 1)
3188EMIT_SHR(32, uint32_t, _amd, 0)
3189
3190#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3191EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3192#endif
3193EMIT_SHR(16, uint16_t, _intel, 1)
3194EMIT_SHR(16, uint16_t, _amd, 0)
3195
3196#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3197EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3198#endif
3199EMIT_SHR(8, uint8_t, _intel, 1)
3200EMIT_SHR(8, uint8_t, _amd, 0)
3201
3202
3203/*
3204 * SAR
3205 */
3206#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3207IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3208{ \
3209 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3210 if (cShift) \
3211 { \
3212 a_iType const iDst = (a_iType)*puDst; \
3213 a_uType uResult = iDst >> cShift; \
3214 *puDst = uResult; \
3215 \
3216 /* Calc EFLAGS. \
3217 Note! The OF flag is always zero because the result never differs from the input. */ \
3218 AssertCompile(X86_EFL_CF_BIT == 0); \
3219 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3220 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3221 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3222 fEfl |= X86_EFL_CALC_ZF(uResult); \
3223 fEfl |= g_afParity[uResult & 0xff]; \
3224 if (!a_fIntelFlags) \
3225 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3226 *pfEFlags = fEfl; \
3227 } \
3228}
3229
3230#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3231EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3232#endif
3233EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3234EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3235
3236#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3237EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3238#endif
3239EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3240EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3241
3242#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3243EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3244#endif
3245EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3246EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3247
3248#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3249EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3250#endif
3251EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3252EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3253
3254
3255/*
3256 * SHLD
3257 *
3258 * - CF is the last bit shifted out of puDst.
3259 * - AF is always cleared by Intel 10980XE.
3260 * - AF is always set by AMD 3990X.
3261 * - OF is set according to the first shift on Intel 10980XE, it seems.
3262 * - OF is set according to the last sub-shift on AMD 3990X.
3263 * - ZF, SF and PF are calculated according to the result by both vendors.
3264 *
3265 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3266 * pick either the source register or the destination register for input bits
3267 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3268 * intel has changed behaviour here several times. We implement what current
3269 * skylake based does for now, we can extend this later as needed.
3270 */
3271#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3272IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3273 uint32_t *pfEFlags)) \
3274{ \
3275 cShift &= a_cBitsWidth - 1; \
3276 if (cShift) \
3277 { \
3278 a_uType const uDst = *puDst; \
3279 a_uType uResult = uDst << cShift; \
3280 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3281 *puDst = uResult; \
3282 \
3283 /* CALC EFLAGS: */ \
3284 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3285 if (a_fIntelFlags) \
3286 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3287 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3288 else \
3289 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3290 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3291 fEfl |= X86_EFL_AF; \
3292 } \
3293 AssertCompile(X86_EFL_CF_BIT == 0); \
3294 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3295 fEfl |= g_afParity[uResult & 0xff]; \
3296 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3297 fEfl |= X86_EFL_CALC_ZF(uResult); \
3298 *pfEFlags = fEfl; \
3299 } \
3300}
3301
3302#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3303EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3304#endif
3305EMIT_SHLD(64, uint64_t, _intel, 1)
3306EMIT_SHLD(64, uint64_t, _amd, 0)
3307
3308#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3309EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3310#endif
3311EMIT_SHLD(32, uint32_t, _intel, 1)
3312EMIT_SHLD(32, uint32_t, _amd, 0)
3313
3314#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3315IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3316{ \
3317 cShift &= 31; \
3318 if (cShift) \
3319 { \
3320 uint16_t const uDst = *puDst; \
3321 uint64_t const uTmp = a_fIntelFlags \
3322 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3323 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3324 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3325 *puDst = uResult; \
3326 \
3327 /* CALC EFLAGS: */ \
3328 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3329 AssertCompile(X86_EFL_CF_BIT == 0); \
3330 if (a_fIntelFlags) \
3331 { \
3332 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3333 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3334 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3335 } \
3336 else \
3337 { \
3338 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3339 if (cShift < 16) \
3340 { \
3341 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3342 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3343 } \
3344 else \
3345 { \
3346 if (cShift == 16) \
3347 fEfl |= uDst & X86_EFL_CF; \
3348 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3349 } \
3350 fEfl |= X86_EFL_AF; \
3351 } \
3352 fEfl |= g_afParity[uResult & 0xff]; \
3353 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3354 fEfl |= X86_EFL_CALC_ZF(uResult); \
3355 *pfEFlags = fEfl; \
3356 } \
3357}
3358
3359#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3360EMIT_SHLD_16(RT_NOTHING, 1)
3361#endif
3362EMIT_SHLD_16(_intel, 1)
3363EMIT_SHLD_16(_amd, 0)
3364
3365
3366/*
3367 * SHRD
3368 *
3369 * EFLAGS behaviour seems to be the same as with SHLD:
3370 * - CF is the last bit shifted out of puDst.
3371 * - AF is always cleared by Intel 10980XE.
3372 * - AF is always set by AMD 3990X.
3373 * - OF is set according to the first shift on Intel 10980XE, it seems.
3374 * - OF is set according to the last sub-shift on AMD 3990X.
3375 * - ZF, SF and PF are calculated according to the result by both vendors.
3376 *
3377 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3378 * pick either the source register or the destination register for input bits
3379 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3380 * intel has changed behaviour here several times. We implement what current
3381 * skylake based does for now, we can extend this later as needed.
3382 */
3383#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3384IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3385{ \
3386 cShift &= a_cBitsWidth - 1; \
3387 if (cShift) \
3388 { \
3389 a_uType const uDst = *puDst; \
3390 a_uType uResult = uDst >> cShift; \
3391 uResult |= uSrc << (a_cBitsWidth - cShift); \
3392 *puDst = uResult; \
3393 \
3394 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3395 AssertCompile(X86_EFL_CF_BIT == 0); \
3396 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3397 if (a_fIntelFlags) \
3398 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3399 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3400 else \
3401 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3402 if (cShift > 1) /* Set according to last shift. */ \
3403 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3404 else \
3405 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3406 fEfl |= X86_EFL_AF; \
3407 } \
3408 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3409 fEfl |= X86_EFL_CALC_ZF(uResult); \
3410 fEfl |= g_afParity[uResult & 0xff]; \
3411 *pfEFlags = fEfl; \
3412 } \
3413}
3414
3415#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3416EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3417#endif
3418EMIT_SHRD(64, uint64_t, _intel, 1)
3419EMIT_SHRD(64, uint64_t, _amd, 0)
3420
3421#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3422EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3423#endif
3424EMIT_SHRD(32, uint32_t, _intel, 1)
3425EMIT_SHRD(32, uint32_t, _amd, 0)
3426
3427#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3428IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3429{ \
3430 cShift &= 31; \
3431 if (cShift) \
3432 { \
3433 uint16_t const uDst = *puDst; \
3434 uint64_t const uTmp = a_fIntelFlags \
3435 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3436 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3437 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3438 *puDst = uResult; \
3439 \
3440 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3441 AssertCompile(X86_EFL_CF_BIT == 0); \
3442 if (a_fIntelFlags) \
3443 { \
3444 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3445 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3446 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3447 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3448 } \
3449 else \
3450 { \
3451 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3452 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3453 /* AMD 3990X: Set according to last shift. AF always set. */ \
3454 if (cShift > 1) /* Set according to last shift. */ \
3455 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3456 else \
3457 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3458 fEfl |= X86_EFL_AF; \
3459 } \
3460 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3461 fEfl |= X86_EFL_CALC_ZF(uResult); \
3462 fEfl |= g_afParity[uResult & 0xff]; \
3463 *pfEFlags = fEfl; \
3464 } \
3465}
3466
3467#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3468EMIT_SHRD_16(RT_NOTHING, 1)
3469#endif
3470EMIT_SHRD_16(_intel, 1)
3471EMIT_SHRD_16(_amd, 0)
3472
3473
3474/*
3475 * RORX (BMI2)
3476 */
3477#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3478IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3479{ \
3480 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3481}
3482
3483#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3484EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3485#endif
3486#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3487EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3488#endif
3489
3490
3491/*
3492 * SHLX (BMI2)
3493 */
3494#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3495IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3496{ \
3497 cShift &= a_cBitsWidth - 1; \
3498 *puDst = uSrc << cShift; \
3499}
3500
3501#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3502EMIT_SHLX(64, uint64_t, RT_NOTHING)
3503EMIT_SHLX(64, uint64_t, _fallback)
3504#endif
3505#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3506EMIT_SHLX(32, uint32_t, RT_NOTHING)
3507EMIT_SHLX(32, uint32_t, _fallback)
3508#endif
3509
3510
3511/*
3512 * SHRX (BMI2)
3513 */
3514#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3515IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3516{ \
3517 cShift &= a_cBitsWidth - 1; \
3518 *puDst = uSrc >> cShift; \
3519}
3520
3521#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3522EMIT_SHRX(64, uint64_t, RT_NOTHING)
3523EMIT_SHRX(64, uint64_t, _fallback)
3524#endif
3525#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3526EMIT_SHRX(32, uint32_t, RT_NOTHING)
3527EMIT_SHRX(32, uint32_t, _fallback)
3528#endif
3529
3530
3531/*
3532 * SARX (BMI2)
3533 */
3534#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3535IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3536{ \
3537 cShift &= a_cBitsWidth - 1; \
3538 *puDst = (a_iType)uSrc >> cShift; \
3539}
3540
3541#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3542EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3543EMIT_SARX(64, uint64_t, int64_t, _fallback)
3544#endif
3545#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3546EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3547EMIT_SARX(32, uint32_t, int32_t, _fallback)
3548#endif
3549
3550
3551/*
3552 * PDEP (BMI2)
3553 */
3554#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PDEP(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PDEP(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PDEP(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PDEP(32, uint32_t, _fallback)
3575
3576/*
3577 * PEXT (BMI2)
3578 */
3579#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3580IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3581{ \
3582 a_uType uResult = 0; \
3583 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3584 if (fMask & ((a_uType)1 << iMaskBit)) \
3585 { \
3586 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3587 iBit++; \
3588 } \
3589 *puDst = uResult; \
3590}
3591
3592#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3593EMIT_PEXT(64, uint64_t, RT_NOTHING)
3594#endif
3595EMIT_PEXT(64, uint64_t, _fallback)
3596#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3597EMIT_PEXT(32, uint32_t, RT_NOTHING)
3598#endif
3599EMIT_PEXT(32, uint32_t, _fallback)
3600
3601
3602#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3603
3604# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3605/*
3606 * BSWAP
3607 */
3608
3609IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3610{
3611 *puDst = ASMByteSwapU64(*puDst);
3612}
3613
3614
3615IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3616{
3617 *puDst = ASMByteSwapU32(*puDst);
3618}
3619
3620
3621/* Note! undocument, so 32-bit arg */
3622IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3623{
3624#if 0
3625 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3626#else
3627 /* This is the behaviour AMD 3990x (64-bit mode): */
3628 *(uint16_t *)puDst = 0;
3629#endif
3630}
3631
3632# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3633
3634
3635
3636# if defined(IEM_WITHOUT_ASSEMBLY)
3637
3638/*
3639 * LFENCE, SFENCE & MFENCE.
3640 */
3641
3642IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3643{
3644 ASMReadFence();
3645}
3646
3647
3648IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3649{
3650 ASMWriteFence();
3651}
3652
3653
3654IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3655{
3656 ASMMemoryFence();
3657}
3658
3659
3660# ifndef RT_ARCH_ARM64
3661IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3662{
3663 ASMMemoryFence();
3664}
3665# endif
3666
3667# endif
3668
3669#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3670
3671
3672IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3673{
3674 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3675 {
3676 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3677 *pu16Dst |= u16Src & X86_SEL_RPL;
3678
3679 *pfEFlags |= X86_EFL_ZF;
3680 }
3681 else
3682 *pfEFlags &= ~X86_EFL_ZF;
3683}
3684
3685
3686#if defined(IEM_WITHOUT_ASSEMBLY)
3687
3688/*********************************************************************************************************************************
3689* x87 FPU Loads *
3690*********************************************************************************************************************************/
3691
3692IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3693{
3694 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3695 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3696 {
3697 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3698 pFpuRes->r80Result.sj64.fInteger = 1;
3699 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3700 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3701 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3702 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3703 }
3704 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3705 {
3706 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3707 pFpuRes->r80Result.s.uExponent = 0;
3708 pFpuRes->r80Result.s.uMantissa = 0;
3709 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3710 }
3711 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3712 {
3713 /* Subnormal values gets normalized. */
3714 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3715 pFpuRes->r80Result.sj64.fInteger = 1;
3716 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3717 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3718 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3719 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3720 pFpuRes->FSW |= X86_FSW_DE;
3721 if (!(pFpuState->FCW & X86_FCW_DM))
3722 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3723 }
3724 else if (RTFLOAT32U_IS_INF(pr32Val))
3725 {
3726 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3727 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3728 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3729 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3730 }
3731 else
3732 {
3733 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3734 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3735 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3736 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3737 pFpuRes->r80Result.sj64.fInteger = 1;
3738 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3739 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3740 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3741 {
3742 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3743 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3744 pFpuRes->FSW |= X86_FSW_IE;
3745
3746 if (!(pFpuState->FCW & X86_FCW_IM))
3747 {
3748 /* The value is not pushed. */
3749 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3750 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3751 pFpuRes->r80Result.au64[0] = 0;
3752 pFpuRes->r80Result.au16[4] = 0;
3753 }
3754 }
3755 else
3756 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3757 }
3758}
3759
3760
3761IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3762{
3763 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3764 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3765 {
3766 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3767 pFpuRes->r80Result.sj64.fInteger = 1;
3768 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3769 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3770 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3771 }
3772 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3773 {
3774 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3775 pFpuRes->r80Result.s.uExponent = 0;
3776 pFpuRes->r80Result.s.uMantissa = 0;
3777 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3778 }
3779 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3780 {
3781 /* Subnormal values gets normalized. */
3782 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3783 pFpuRes->r80Result.sj64.fInteger = 1;
3784 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3785 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3786 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3787 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3788 pFpuRes->FSW |= X86_FSW_DE;
3789 if (!(pFpuState->FCW & X86_FCW_DM))
3790 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3791 }
3792 else if (RTFLOAT64U_IS_INF(pr64Val))
3793 {
3794 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3795 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3796 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3797 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3798 }
3799 else
3800 {
3801 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3802 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3803 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3804 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3805 pFpuRes->r80Result.sj64.fInteger = 1;
3806 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3807 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3808 {
3809 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3810 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3811 pFpuRes->FSW |= X86_FSW_IE;
3812
3813 if (!(pFpuState->FCW & X86_FCW_IM))
3814 {
3815 /* The value is not pushed. */
3816 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3817 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3818 pFpuRes->r80Result.au64[0] = 0;
3819 pFpuRes->r80Result.au16[4] = 0;
3820 }
3821 }
3822 else
3823 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3824 }
3825}
3826
3827
3828IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3829{
3830 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3831 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3832 /* Raises no exceptions. */
3833 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3834}
3835
3836
3837IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3838{
3839 pFpuRes->r80Result.sj64.fSign = 0;
3840 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3841 pFpuRes->r80Result.sj64.fInteger = 1;
3842 pFpuRes->r80Result.sj64.uFraction = 0;
3843
3844 /*
3845 * FPU status word:
3846 * - TOP is irrelevant, but we must match x86 assembly version.
3847 * - C1 is always cleared as we don't have any stack overflows.
3848 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3849 */
3850 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3851}
3852
3853
3854IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3855{
3856 pFpuRes->r80Result.sj64.fSign = 0;
3857 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3858 pFpuRes->r80Result.sj64.fInteger = 1;
3859 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3860 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3861 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3862 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3863}
3864
3865
3866IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3867{
3868 pFpuRes->r80Result.sj64.fSign = 0;
3869 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3870 pFpuRes->r80Result.sj64.fInteger = 1;
3871 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3872 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3873 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3874}
3875
3876
3877IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3878{
3879 pFpuRes->r80Result.sj64.fSign = 0;
3880 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3881 pFpuRes->r80Result.sj64.fInteger = 1;
3882 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3883 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3884 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3885 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3886}
3887
3888
3889IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3890{
3891 pFpuRes->r80Result.sj64.fSign = 0;
3892 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3893 pFpuRes->r80Result.sj64.fInteger = 1;
3894 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3895 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3896 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3897 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3898}
3899
3900
3901IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3902{
3903 pFpuRes->r80Result.sj64.fSign = 0;
3904 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3905 pFpuRes->r80Result.sj64.fInteger = 1;
3906 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3907 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3908 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3909 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3910}
3911
3912
3913IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3914{
3915 pFpuRes->r80Result.s.fSign = 0;
3916 pFpuRes->r80Result.s.uExponent = 0;
3917 pFpuRes->r80Result.s.uMantissa = 0;
3918 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3919}
3920
3921#define EMIT_FILD(a_cBits) \
3922IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3923 int ## a_cBits ## _t const *piVal)) \
3924{ \
3925 int ## a_cBits ## _t iVal = *piVal; \
3926 if (iVal == 0) \
3927 { \
3928 pFpuRes->r80Result.s.fSign = 0; \
3929 pFpuRes->r80Result.s.uExponent = 0; \
3930 pFpuRes->r80Result.s.uMantissa = 0; \
3931 } \
3932 else \
3933 { \
3934 if (iVal > 0) \
3935 pFpuRes->r80Result.s.fSign = 0; \
3936 else \
3937 { \
3938 pFpuRes->r80Result.s.fSign = 1; \
3939 iVal = -iVal; \
3940 } \
3941 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3942 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3943 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3944 } \
3945 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3946}
3947EMIT_FILD(16)
3948EMIT_FILD(32)
3949EMIT_FILD(64)
3950
3951
3952IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3953{
3954 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3955 if ( pd80Val->s.abPairs[0] == 0
3956 && pd80Val->s.abPairs[1] == 0
3957 && pd80Val->s.abPairs[2] == 0
3958 && pd80Val->s.abPairs[3] == 0
3959 && pd80Val->s.abPairs[4] == 0
3960 && pd80Val->s.abPairs[5] == 0
3961 && pd80Val->s.abPairs[6] == 0
3962 && pd80Val->s.abPairs[7] == 0
3963 && pd80Val->s.abPairs[8] == 0)
3964 {
3965 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3966 pFpuRes->r80Result.s.uExponent = 0;
3967 pFpuRes->r80Result.s.uMantissa = 0;
3968 }
3969 else
3970 {
3971 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3972
3973 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3974 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3975 cPairs--;
3976
3977 uint64_t uVal = 0;
3978 uint64_t uFactor = 1;
3979 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3980 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3981 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3982
3983 unsigned const cBits = ASMBitLastSetU64(uVal);
3984 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3985 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3986 }
3987}
3988
3989
3990/*********************************************************************************************************************************
3991* x87 FPU Stores *
3992*********************************************************************************************************************************/
3993
3994/**
3995 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
3996 *
3997 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
3998 *
3999 * @returns Updated FPU status word value.
4000 * @param fSignIn Incoming sign indicator.
4001 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4002 * @param iExponentIn Unbiased exponent.
4003 * @param fFcw The FPU control word.
4004 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4005 * @param pr32Dst Where to return the output value, if one should be
4006 * returned.
4007 *
4008 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4009 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4010 */
4011static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4012 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4013{
4014 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4015 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4016 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4017 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4018 ? fRoundingOffMask
4019 : 0;
4020 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4021
4022 /*
4023 * Deal with potential overflows/underflows first, optimizing for none.
4024 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4025 */
4026 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4027 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4028 { /* likely? */ }
4029 /*
4030 * Underflow if the exponent zero or negative. This is attempted mapped
4031 * to a subnormal number when possible, with some additional trickery ofc.
4032 */
4033 else if (iExponentOut <= 0)
4034 {
4035 bool const fIsTiny = iExponentOut < 0
4036 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4037 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4038 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4039 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4040
4041 if (iExponentOut <= 0)
4042 {
4043 uMantissaIn = iExponentOut <= -63
4044 ? uMantissaIn != 0
4045 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4046 fRoundedOff = uMantissaIn & fRoundingOffMask;
4047 if (fRoundedOff && fIsTiny)
4048 fFsw |= X86_FSW_UE;
4049 iExponentOut = 0;
4050 }
4051 }
4052 /*
4053 * Overflow if at or above max exponent value or if we will reach max
4054 * when rounding. Will return +/-zero or +/-max value depending on
4055 * whether we're rounding or not.
4056 */
4057 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4058 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4059 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4060 {
4061 fFsw |= X86_FSW_OE;
4062 if (!(fFcw & X86_FCW_OM))
4063 return fFsw | X86_FSW_ES | X86_FSW_B;
4064 fFsw |= X86_FSW_PE;
4065 if (uRoundingAdd)
4066 fFsw |= X86_FSW_C1;
4067 if (!(fFcw & X86_FCW_PM))
4068 fFsw |= X86_FSW_ES | X86_FSW_B;
4069
4070 pr32Dst->s.fSign = fSignIn;
4071 if (uRoundingAdd)
4072 { /* Zero */
4073 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4074 pr32Dst->s.uFraction = 0;
4075 }
4076 else
4077 { /* Max */
4078 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4079 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4080 }
4081 return fFsw;
4082 }
4083
4084 /*
4085 * Normal or subnormal number.
4086 */
4087 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4088 uint64_t uMantissaOut = uMantissaIn;
4089 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4090 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4091 || fRoundedOff != uRoundingAdd)
4092 {
4093 uMantissaOut = uMantissaIn + uRoundingAdd;
4094 if (uMantissaOut >= uMantissaIn)
4095 { /* likely */ }
4096 else
4097 {
4098 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4099 iExponentOut++;
4100 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4101 fFsw |= X86_FSW_C1;
4102 }
4103 }
4104 else
4105 uMantissaOut = uMantissaIn;
4106
4107 /* Truncate the mantissa and set the return value. */
4108 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4109
4110 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4111 pr32Dst->s.uExponent = iExponentOut;
4112 pr32Dst->s.fSign = fSignIn;
4113
4114 /* Set status flags realted to rounding. */
4115 if (fRoundedOff)
4116 {
4117 fFsw |= X86_FSW_PE;
4118 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4119 fFsw |= X86_FSW_C1;
4120 if (!(fFcw & X86_FCW_PM))
4121 fFsw |= X86_FSW_ES | X86_FSW_B;
4122 }
4123
4124 return fFsw;
4125}
4126
4127
4128/**
4129 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4130 */
4131IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4132 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4133{
4134 uint16_t const fFcw = pFpuState->FCW;
4135 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4136 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4137 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4138 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4139 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4140 {
4141 pr32Dst->s.fSign = pr80Src->s.fSign;
4142 pr32Dst->s.uExponent = 0;
4143 pr32Dst->s.uFraction = 0;
4144 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4145 }
4146 else if (RTFLOAT80U_IS_INF(pr80Src))
4147 {
4148 pr32Dst->s.fSign = pr80Src->s.fSign;
4149 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4150 pr32Dst->s.uFraction = 0;
4151 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4152 }
4153 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4154 {
4155 /* Mapped to +/-QNaN */
4156 pr32Dst->s.fSign = pr80Src->s.fSign;
4157 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4158 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4159 }
4160 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4161 {
4162 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4163 if (fFcw & X86_FCW_IM)
4164 {
4165 pr32Dst->s.fSign = 1;
4166 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4167 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4168 fFsw |= X86_FSW_IE;
4169 }
4170 else
4171 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4172 }
4173 else if (RTFLOAT80U_IS_NAN(pr80Src))
4174 {
4175 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4176 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4177 {
4178 pr32Dst->s.fSign = pr80Src->s.fSign;
4179 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4180 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4181 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4182 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4183 fFsw |= X86_FSW_IE;
4184 }
4185 else
4186 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4187 }
4188 else
4189 {
4190 /* Denormal values causes both an underflow and precision exception. */
4191 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4192 if (fFcw & X86_FCW_UM)
4193 {
4194 pr32Dst->s.fSign = pr80Src->s.fSign;
4195 pr32Dst->s.uExponent = 0;
4196 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4197 {
4198 pr32Dst->s.uFraction = 1;
4199 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4200 if (!(fFcw & X86_FCW_PM))
4201 fFsw |= X86_FSW_ES | X86_FSW_B;
4202 }
4203 else
4204 {
4205 pr32Dst->s.uFraction = 0;
4206 fFsw |= X86_FSW_UE | X86_FSW_PE;
4207 if (!(fFcw & X86_FCW_PM))
4208 fFsw |= X86_FSW_ES | X86_FSW_B;
4209 }
4210 }
4211 else
4212 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4213 }
4214 *pu16FSW = fFsw;
4215}
4216
4217
4218/**
4219 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4220 *
4221 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4222 *
4223 * @returns Updated FPU status word value.
4224 * @param fSignIn Incoming sign indicator.
4225 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4226 * @param iExponentIn Unbiased exponent.
4227 * @param fFcw The FPU control word.
4228 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4229 * @param pr64Dst Where to return the output value, if one should be
4230 * returned.
4231 *
4232 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4233 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4234 */
4235static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4236 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4237{
4238 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4239 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4240 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4241 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4242 ? fRoundingOffMask
4243 : 0;
4244 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4245
4246 /*
4247 * Deal with potential overflows/underflows first, optimizing for none.
4248 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4249 */
4250 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4251 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4252 { /* likely? */ }
4253 /*
4254 * Underflow if the exponent zero or negative. This is attempted mapped
4255 * to a subnormal number when possible, with some additional trickery ofc.
4256 */
4257 else if (iExponentOut <= 0)
4258 {
4259 bool const fIsTiny = iExponentOut < 0
4260 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4261 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4262 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4263 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4264
4265 if (iExponentOut <= 0)
4266 {
4267 uMantissaIn = iExponentOut <= -63
4268 ? uMantissaIn != 0
4269 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4270 fRoundedOff = uMantissaIn & fRoundingOffMask;
4271 if (fRoundedOff && fIsTiny)
4272 fFsw |= X86_FSW_UE;
4273 iExponentOut = 0;
4274 }
4275 }
4276 /*
4277 * Overflow if at or above max exponent value or if we will reach max
4278 * when rounding. Will return +/-zero or +/-max value depending on
4279 * whether we're rounding or not.
4280 */
4281 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4282 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4283 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4284 {
4285 fFsw |= X86_FSW_OE;
4286 if (!(fFcw & X86_FCW_OM))
4287 return fFsw | X86_FSW_ES | X86_FSW_B;
4288 fFsw |= X86_FSW_PE;
4289 if (uRoundingAdd)
4290 fFsw |= X86_FSW_C1;
4291 if (!(fFcw & X86_FCW_PM))
4292 fFsw |= X86_FSW_ES | X86_FSW_B;
4293
4294 pr64Dst->s64.fSign = fSignIn;
4295 if (uRoundingAdd)
4296 { /* Zero */
4297 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4298 pr64Dst->s64.uFraction = 0;
4299 }
4300 else
4301 { /* Max */
4302 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4303 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4304 }
4305 return fFsw;
4306 }
4307
4308 /*
4309 * Normal or subnormal number.
4310 */
4311 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4312 uint64_t uMantissaOut = uMantissaIn;
4313 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4314 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4315 || fRoundedOff != uRoundingAdd)
4316 {
4317 uMantissaOut = uMantissaIn + uRoundingAdd;
4318 if (uMantissaOut >= uMantissaIn)
4319 { /* likely */ }
4320 else
4321 {
4322 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4323 iExponentOut++;
4324 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4325 fFsw |= X86_FSW_C1;
4326 }
4327 }
4328 else
4329 uMantissaOut = uMantissaIn;
4330
4331 /* Truncate the mantissa and set the return value. */
4332 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4333
4334 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4335 pr64Dst->s64.uExponent = iExponentOut;
4336 pr64Dst->s64.fSign = fSignIn;
4337
4338 /* Set status flags realted to rounding. */
4339 if (fRoundedOff)
4340 {
4341 fFsw |= X86_FSW_PE;
4342 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4343 fFsw |= X86_FSW_C1;
4344 if (!(fFcw & X86_FCW_PM))
4345 fFsw |= X86_FSW_ES | X86_FSW_B;
4346 }
4347
4348 return fFsw;
4349}
4350
4351
4352/**
4353 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4354 */
4355IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4356 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4357{
4358 uint16_t const fFcw = pFpuState->FCW;
4359 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4360 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4361 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4362 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4363 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4364 {
4365 pr64Dst->s64.fSign = pr80Src->s.fSign;
4366 pr64Dst->s64.uExponent = 0;
4367 pr64Dst->s64.uFraction = 0;
4368 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4369 }
4370 else if (RTFLOAT80U_IS_INF(pr80Src))
4371 {
4372 pr64Dst->s64.fSign = pr80Src->s.fSign;
4373 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4374 pr64Dst->s64.uFraction = 0;
4375 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4376 }
4377 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4378 {
4379 /* Mapped to +/-QNaN */
4380 pr64Dst->s64.fSign = pr80Src->s.fSign;
4381 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4382 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4383 }
4384 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4385 {
4386 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4387 if (fFcw & X86_FCW_IM)
4388 {
4389 pr64Dst->s64.fSign = 1;
4390 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4391 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4392 fFsw |= X86_FSW_IE;
4393 }
4394 else
4395 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4396 }
4397 else if (RTFLOAT80U_IS_NAN(pr80Src))
4398 {
4399 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4400 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4401 {
4402 pr64Dst->s64.fSign = pr80Src->s.fSign;
4403 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4404 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4405 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4406 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4407 fFsw |= X86_FSW_IE;
4408 }
4409 else
4410 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4411 }
4412 else
4413 {
4414 /* Denormal values causes both an underflow and precision exception. */
4415 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4416 if (fFcw & X86_FCW_UM)
4417 {
4418 pr64Dst->s64.fSign = pr80Src->s.fSign;
4419 pr64Dst->s64.uExponent = 0;
4420 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4421 {
4422 pr64Dst->s64.uFraction = 1;
4423 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4424 if (!(fFcw & X86_FCW_PM))
4425 fFsw |= X86_FSW_ES | X86_FSW_B;
4426 }
4427 else
4428 {
4429 pr64Dst->s64.uFraction = 0;
4430 fFsw |= X86_FSW_UE | X86_FSW_PE;
4431 if (!(fFcw & X86_FCW_PM))
4432 fFsw |= X86_FSW_ES | X86_FSW_B;
4433 }
4434 }
4435 else
4436 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4437 }
4438 *pu16FSW = fFsw;
4439}
4440
4441
4442IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4443 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4444{
4445 /*
4446 * FPU status word:
4447 * - TOP is irrelevant, but we must match x86 assembly version (0).
4448 * - C1 is always cleared as we don't have any stack overflows.
4449 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4450 */
4451 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4452 *pr80Dst = *pr80Src;
4453}
4454
4455
4456/*
4457 *
4458 * Mantissa:
4459 * 63 56 48 40 32 24 16 8 0
4460 * v v v v v v v v v
4461 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4462 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4463 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4464 *
4465 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4466 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4467 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4468 * where we'll drop off all but bit 63.
4469 */
4470#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4471IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4472 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4473{ \
4474 uint16_t const fFcw = pFpuState->FCW; \
4475 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4476 bool const fSignIn = pr80Val->s.fSign; \
4477 \
4478 /* \
4479 * Deal with normal numbers first. \
4480 */ \
4481 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4482 { \
4483 uint64_t uMantissa = pr80Val->s.uMantissa; \
4484 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4485 \
4486 if ((uint32_t)iExponent <= a_cBits - 2) \
4487 { \
4488 unsigned const cShiftOff = 63 - iExponent; \
4489 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4490 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4491 ? RT_BIT_64(cShiftOff - 1) \
4492 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4493 ? fRoundingOffMask \
4494 : 0; \
4495 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4496 \
4497 uMantissa >>= cShiftOff; \
4498 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4499 uMantissa += uRounding; \
4500 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4501 { \
4502 if (fRoundedOff) \
4503 { \
4504 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4505 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4506 else if (uRounding) \
4507 fFsw |= X86_FSW_C1; \
4508 fFsw |= X86_FSW_PE; \
4509 if (!(fFcw & X86_FCW_PM)) \
4510 fFsw |= X86_FSW_ES | X86_FSW_B; \
4511 } \
4512 \
4513 if (!fSignIn) \
4514 *piDst = (a_iType)uMantissa; \
4515 else \
4516 *piDst = -(a_iType)uMantissa; \
4517 } \
4518 else \
4519 { \
4520 /* overflowed after rounding. */ \
4521 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4522 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4523 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4524 \
4525 /* Special case for the integer minimum value. */ \
4526 if (fSignIn) \
4527 { \
4528 *piDst = a_iTypeMin; \
4529 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4530 if (!(fFcw & X86_FCW_PM)) \
4531 fFsw |= X86_FSW_ES | X86_FSW_B; \
4532 } \
4533 else \
4534 { \
4535 fFsw |= X86_FSW_IE; \
4536 if (fFcw & X86_FCW_IM) \
4537 *piDst = a_iTypeMin; \
4538 else \
4539 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4540 } \
4541 } \
4542 } \
4543 /* \
4544 * Tiny sub-zero numbers. \
4545 */ \
4546 else if (iExponent < 0) \
4547 { \
4548 if (!fSignIn) \
4549 { \
4550 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4551 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4552 { \
4553 *piDst = 1; \
4554 fFsw |= X86_FSW_C1; \
4555 } \
4556 else \
4557 *piDst = 0; \
4558 } \
4559 else \
4560 { \
4561 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4562 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4563 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4564 *piDst = 0; \
4565 else \
4566 { \
4567 *piDst = -1; \
4568 fFsw |= X86_FSW_C1; \
4569 } \
4570 } \
4571 fFsw |= X86_FSW_PE; \
4572 if (!(fFcw & X86_FCW_PM)) \
4573 fFsw |= X86_FSW_ES | X86_FSW_B; \
4574 } \
4575 /* \
4576 * Special MIN case. \
4577 */ \
4578 else if ( fSignIn && iExponent == a_cBits - 1 \
4579 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4580 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4581 : uMantissa == RT_BIT_64(63))) \
4582 { \
4583 *piDst = a_iTypeMin; \
4584 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4585 { \
4586 fFsw |= X86_FSW_PE; \
4587 if (!(fFcw & X86_FCW_PM)) \
4588 fFsw |= X86_FSW_ES | X86_FSW_B; \
4589 } \
4590 } \
4591 /* \
4592 * Too large/small number outside the target integer range. \
4593 */ \
4594 else \
4595 { \
4596 fFsw |= X86_FSW_IE; \
4597 if (fFcw & X86_FCW_IM) \
4598 *piDst = a_iTypeIndefinite; \
4599 else \
4600 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4601 } \
4602 } \
4603 /* \
4604 * Map both +0 and -0 to integer zero (signless/+). \
4605 */ \
4606 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4607 *piDst = 0; \
4608 /* \
4609 * Denormals are just really tiny sub-zero numbers that are either rounded \
4610 * to zero, 1 or -1 depending on sign and rounding control. \
4611 */ \
4612 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4613 { \
4614 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4615 *piDst = 0; \
4616 else \
4617 { \
4618 *piDst = fSignIn ? -1 : 1; \
4619 fFsw |= X86_FSW_C1; \
4620 } \
4621 fFsw |= X86_FSW_PE; \
4622 if (!(fFcw & X86_FCW_PM)) \
4623 fFsw |= X86_FSW_ES | X86_FSW_B; \
4624 } \
4625 /* \
4626 * All other special values are considered invalid arguments and result \
4627 * in an IE exception and indefinite value if masked. \
4628 */ \
4629 else \
4630 { \
4631 fFsw |= X86_FSW_IE; \
4632 if (fFcw & X86_FCW_IM) \
4633 *piDst = a_iTypeIndefinite; \
4634 else \
4635 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4636 } \
4637 *pu16FSW = fFsw; \
4638}
4639EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4640EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4641EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4642
4643#endif /*IEM_WITHOUT_ASSEMBLY */
4644
4645
4646/*
4647 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4648 *
4649 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4650 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4651 * thus the @a a_cBitsIn.
4652 */
4653#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4654IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4655 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4656{ \
4657 uint16_t const fFcw = pFpuState->FCW; \
4658 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4659 bool const fSignIn = pr80Val->s.fSign; \
4660 \
4661 /* \
4662 * Deal with normal numbers first. \
4663 */ \
4664 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4665 { \
4666 uint64_t uMantissa = pr80Val->s.uMantissa; \
4667 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4668 \
4669 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4670 { \
4671 unsigned const cShiftOff = 63 - iExponent; \
4672 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4673 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4674 uMantissa >>= cShiftOff; \
4675 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4676 if (!fSignIn) \
4677 *piDst = (a_iType)uMantissa; \
4678 else \
4679 *piDst = -(a_iType)uMantissa; \
4680 \
4681 if (fRoundedOff) \
4682 { \
4683 fFsw |= X86_FSW_PE; \
4684 if (!(fFcw & X86_FCW_PM)) \
4685 fFsw |= X86_FSW_ES | X86_FSW_B; \
4686 } \
4687 } \
4688 /* \
4689 * Tiny sub-zero numbers. \
4690 */ \
4691 else if (iExponent < 0) \
4692 { \
4693 *piDst = 0; \
4694 fFsw |= X86_FSW_PE; \
4695 if (!(fFcw & X86_FCW_PM)) \
4696 fFsw |= X86_FSW_ES | X86_FSW_B; \
4697 } \
4698 /* \
4699 * Special MIN case. \
4700 */ \
4701 else if ( fSignIn && iExponent == a_cBits - 1 \
4702 && (a_cBits < 64 \
4703 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4704 : uMantissa == RT_BIT_64(63)) ) \
4705 { \
4706 *piDst = a_iTypeMin; \
4707 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4708 { \
4709 fFsw |= X86_FSW_PE; \
4710 if (!(fFcw & X86_FCW_PM)) \
4711 fFsw |= X86_FSW_ES | X86_FSW_B; \
4712 } \
4713 } \
4714 /* \
4715 * Figure this weirdness. \
4716 */ \
4717 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4718 { \
4719 *piDst = 0; \
4720 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4721 { \
4722 fFsw |= X86_FSW_PE; \
4723 if (!(fFcw & X86_FCW_PM)) \
4724 fFsw |= X86_FSW_ES | X86_FSW_B; \
4725 } \
4726 } \
4727 /* \
4728 * Too large/small number outside the target integer range. \
4729 */ \
4730 else \
4731 { \
4732 fFsw |= X86_FSW_IE; \
4733 if (fFcw & X86_FCW_IM) \
4734 *piDst = a_iTypeIndefinite; \
4735 else \
4736 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4737 } \
4738 } \
4739 /* \
4740 * Map both +0 and -0 to integer zero (signless/+). \
4741 */ \
4742 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4743 *piDst = 0; \
4744 /* \
4745 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4746 */ \
4747 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4748 { \
4749 *piDst = 0; \
4750 fFsw |= X86_FSW_PE; \
4751 if (!(fFcw & X86_FCW_PM)) \
4752 fFsw |= X86_FSW_ES | X86_FSW_B; \
4753 } \
4754 /* \
4755 * All other special values are considered invalid arguments and result \
4756 * in an IE exception and indefinite value if masked. \
4757 */ \
4758 else \
4759 { \
4760 fFsw |= X86_FSW_IE; \
4761 if (fFcw & X86_FCW_IM) \
4762 *piDst = a_iTypeIndefinite; \
4763 else \
4764 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4765 } \
4766 *pu16FSW = fFsw; \
4767}
4768#if defined(IEM_WITHOUT_ASSEMBLY)
4769EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4770EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4771EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4772#endif
4773EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4774EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4775
4776
4777#if defined(IEM_WITHOUT_ASSEMBLY)
4778
4779IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4780 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4781{
4782 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4783 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4784 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4785 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4786 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4787
4788 uint16_t const fFcw = pFpuState->FCW;
4789 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4790 bool const fSignIn = pr80Src->s.fSign;
4791
4792 /*
4793 * Deal with normal numbers first.
4794 */
4795 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4796 {
4797 uint64_t uMantissa = pr80Src->s.uMantissa;
4798 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
4799 if ( (uint32_t)iExponent <= 58
4800 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4801 {
4802 unsigned const cShiftOff = 63 - iExponent;
4803 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4804 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4805 ? RT_BIT_64(cShiftOff - 1)
4806 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4807 ? fRoundingOffMask
4808 : 0;
4809 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4810
4811 uMantissa >>= cShiftOff;
4812 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4813 uMantissa += uRounding;
4814 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4815 {
4816 if (fRoundedOff)
4817 {
4818 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4819 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4820 else if (uRounding)
4821 fFsw |= X86_FSW_C1;
4822 fFsw |= X86_FSW_PE;
4823 if (!(fFcw & X86_FCW_PM))
4824 fFsw |= X86_FSW_ES | X86_FSW_B;
4825 }
4826
4827 pd80Dst->s.fSign = fSignIn;
4828 pd80Dst->s.uPad = 0;
4829 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4830 {
4831 unsigned const uDigits = uMantissa % 100;
4832 uMantissa /= 100;
4833 uint8_t const bLo = uDigits % 10;
4834 uint8_t const bHi = uDigits / 10;
4835 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4836 }
4837 }
4838 else
4839 {
4840 /* overflowed after rounding. */
4841 fFsw |= X86_FSW_IE;
4842 if (fFcw & X86_FCW_IM)
4843 *pd80Dst = s_d80Indefinite;
4844 else
4845 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4846 }
4847 }
4848 /*
4849 * Tiny sub-zero numbers.
4850 */
4851 else if (iExponent < 0)
4852 {
4853 if (!fSignIn)
4854 {
4855 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4856 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4857 {
4858 *pd80Dst = s_ad80One[fSignIn];
4859 fFsw |= X86_FSW_C1;
4860 }
4861 else
4862 *pd80Dst = s_ad80Zeros[fSignIn];
4863 }
4864 else
4865 {
4866 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4867 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4868 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4869 *pd80Dst = s_ad80Zeros[fSignIn];
4870 else
4871 {
4872 *pd80Dst = s_ad80One[fSignIn];
4873 fFsw |= X86_FSW_C1;
4874 }
4875 }
4876 fFsw |= X86_FSW_PE;
4877 if (!(fFcw & X86_FCW_PM))
4878 fFsw |= X86_FSW_ES | X86_FSW_B;
4879 }
4880 /*
4881 * Too large/small number outside the target integer range.
4882 */
4883 else
4884 {
4885 fFsw |= X86_FSW_IE;
4886 if (fFcw & X86_FCW_IM)
4887 *pd80Dst = s_d80Indefinite;
4888 else
4889 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4890 }
4891 }
4892 /*
4893 * Map both +0 and -0 to integer zero (signless/+).
4894 */
4895 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4896 *pd80Dst = s_ad80Zeros[fSignIn];
4897 /*
4898 * Denormals are just really tiny sub-zero numbers that are either rounded
4899 * to zero, 1 or -1 depending on sign and rounding control.
4900 */
4901 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4902 {
4903 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4904 *pd80Dst = s_ad80Zeros[fSignIn];
4905 else
4906 {
4907 *pd80Dst = s_ad80One[fSignIn];
4908 fFsw |= X86_FSW_C1;
4909 }
4910 fFsw |= X86_FSW_PE;
4911 if (!(fFcw & X86_FCW_PM))
4912 fFsw |= X86_FSW_ES | X86_FSW_B;
4913 }
4914 /*
4915 * All other special values are considered invalid arguments and result
4916 * in an IE exception and indefinite value if masked.
4917 */
4918 else
4919 {
4920 fFsw |= X86_FSW_IE;
4921 if (fFcw & X86_FCW_IM)
4922 *pd80Dst = s_d80Indefinite;
4923 else
4924 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4925 }
4926 *pu16FSW = fFsw;
4927}
4928
4929
4930/*********************************************************************************************************************************
4931* FPU Helpers *
4932*********************************************************************************************************************************/
4933AssertCompileSize(RTFLOAT128U, 16);
4934AssertCompileSize(RTFLOAT80U, 10);
4935AssertCompileSize(RTFLOAT64U, 8);
4936AssertCompileSize(RTFLOAT32U, 4);
4937
4938/**
4939 * Normalizes a possible pseudo-normal value.
4940 *
4941 * Psuedo-normal values are some oddities from the 8087 & 287 days. They are
4942 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
4943 * i.e. changing uExponent from 0 to 1.
4944 *
4945 * This macro will declare a RTFLOAT80U with the name given by
4946 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
4947 * a normalization was performed.
4948 *
4949 * @note This must be applied before calling SoftFloat with a value that couldbe
4950 * a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
4951 * correctly.
4952 */
4953#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
4954 RTFLOAT80U a_r80ValNormalized; \
4955 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
4956 { \
4957 a_r80ValNormalized = *a_pr80Val; \
4958 a_r80ValNormalized.s.uExponent = 1; \
4959 a_pr80Val = &a_r80ValNormalized; \
4960 } else do {} while (0)
4961
4962#ifdef IEM_WITH_FLOAT128_FOR_FPU
4963
4964DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4965{
4966 int fNew;
4967 switch (fFcw & X86_FCW_RC_MASK)
4968 {
4969 default:
4970 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4971 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4972 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4973 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4974 }
4975 int fOld = fegetround();
4976 fesetround(fNew);
4977 return fOld;
4978}
4979
4980
4981DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4982{
4983 fesetround(fOld);
4984}
4985
4986DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4987{
4988 RT_NOREF(fFcw);
4989 RTFLOAT128U Tmp;
4990 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4991 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4992 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4993 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4994 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4995 {
4996 Assert(Tmp.s.uExponent == 0);
4997 Tmp.s2.uSignAndExponent++;
4998 }
4999 return *(_Float128 *)&Tmp;
5000}
5001
5002
5003DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5004{
5005 RT_NOREF(fFcw);
5006 RTFLOAT128U Tmp;
5007 *(_Float128 *)&Tmp = rd128ValSrc;
5008 ASMCompilerBarrier();
5009 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5010 {
5011 pr80Dst->s.fSign = Tmp.s64.fSign;
5012 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5013 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5014 | Tmp.s64.uFractionLo >> (64 - 15);
5015
5016 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5017 unsigned const cShiftOff = 64 - 15;
5018 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5019 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5020 if (uRoundedOff)
5021 {
5022 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5023 ? RT_BIT_64(cShiftOff - 1)
5024 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5025 ? fRoundingOffMask
5026 : 0;
5027 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5028 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5029 || uRoundedOff != uRoundingAdd)
5030 {
5031 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5032 {
5033 uFraction += 1;
5034 if (!(uFraction & RT_BIT_64(63)))
5035 { /* likely */ }
5036 else
5037 {
5038 uFraction >>= 1;
5039 pr80Dst->s.uExponent++;
5040 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5041 return fFsw;
5042 }
5043 fFsw |= X86_FSW_C1;
5044 }
5045 }
5046 fFsw |= X86_FSW_PE;
5047 if (!(fFcw & X86_FCW_PM))
5048 fFsw |= X86_FSW_ES | X86_FSW_B;
5049 }
5050 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5051 }
5052 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5053 {
5054 pr80Dst->s.fSign = Tmp.s64.fSign;
5055 pr80Dst->s.uExponent = 0;
5056 pr80Dst->s.uMantissa = 0;
5057 }
5058 else if (RTFLOAT128U_IS_INF(&Tmp))
5059 {
5060 pr80Dst->s.fSign = Tmp.s64.fSign;
5061 pr80Dst->s.uExponent = 0;
5062 pr80Dst->s.uMantissa = 0;
5063 }
5064 return fFsw;
5065}
5066
5067
5068#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5069
5070/** Initializer for the SoftFloat state structure. */
5071# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
5072 { \
5073 softfloat_tininess_afterRounding, \
5074 ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
5075 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP ? (uint8_t)softfloat_round_max \
5076 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN ? (uint8_t)softfloat_round_min \
5077 : (uint8_t)softfloat_round_minMag, \
5078 0, \
5079 (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
5080 ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
5081 : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
5082 }
5083
5084/** Returns updated FSW from a SoftFloat state and exception mask (FCW). */
5085# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
5086 ( (a_fFsw) \
5087 | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
5088 | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
5089 | ( ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
5090 ? X86_FSW_ES | X86_FSW_B : 0) )
5091
5092
5093DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5094{
5095 RT_NOREF(fFcw);
5096 Assert(cBits > 64);
5097# if 0 /* rounding does not seem to help */
5098 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5099 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5100 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5101 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5102 {
5103 uint64_t uOld = r128.v[0];
5104 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5105 if (r128.v[0] < uOld)
5106 r128.v[1] += 1;
5107 }
5108# else
5109 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5110# endif
5111 return r128;
5112}
5113
5114
5115DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5116{
5117 RT_NOREF(fFcw);
5118 Assert(cBits > 64);
5119# if 0 /* rounding does not seem to help, not even on constants */
5120 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5121 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5122 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5123 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5124 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5125 {
5126 uint64_t uOld = r128.v[0];
5127 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5128 if (r128.v[0] < uOld)
5129 r128.v[1] += 1;
5130 }
5131 return r128;
5132# else
5133 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5134 return r128;
5135# endif
5136}
5137
5138
5139# if 0 /* unused */
5140DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
5141{
5142 float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
5143 return r128;
5144}
5145# endif
5146
5147
5148/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5149DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5150{
5151 extFloat80_t Tmp;
5152 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5153 Tmp.signif = pr80Val->s2.uMantissa;
5154 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5155 return extF80_to_f128(Tmp, &Ignored);
5156}
5157
5158
5159/**
5160 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5161 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5162 *
5163 * This is only a structure format conversion, nothing else.
5164 */
5165DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5166{
5167 extFloat80_t Tmp;
5168 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5169 Tmp.signif = pr80Val->s2.uMantissa;
5170 return Tmp;
5171}
5172
5173
5174/**
5175 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5176 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5177 *
5178 * This is only a structure format conversion, nothing else.
5179 */
5180DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5181{
5182 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5183 pr80Dst->s2.uMantissa = r80XSrc.signif;
5184 return pr80Dst;
5185}
5186
5187
5188DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5189{
5190 RT_NOREF(fFcw);
5191 RTFLOAT128U Tmp;
5192 *(float128_t *)&Tmp = r128Src;
5193 ASMCompilerBarrier();
5194
5195 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5196 {
5197 pr80Dst->s.fSign = Tmp.s64.fSign;
5198 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5199 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5200 | Tmp.s64.uFractionLo >> (64 - 15);
5201
5202 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5203 unsigned const cShiftOff = 64 - 15;
5204 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5205 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5206 if (uRoundedOff)
5207 {
5208 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5209 ? RT_BIT_64(cShiftOff - 1)
5210 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5211 ? fRoundingOffMask
5212 : 0;
5213 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5214 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5215 || uRoundedOff != uRoundingAdd)
5216 {
5217 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5218 {
5219 uFraction += 1;
5220 if (!(uFraction & RT_BIT_64(63)))
5221 { /* likely */ }
5222 else
5223 {
5224 uFraction >>= 1;
5225 pr80Dst->s.uExponent++;
5226 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5227 return fFsw;
5228 }
5229 fFsw |= X86_FSW_C1;
5230 }
5231 }
5232 fFsw |= X86_FSW_PE;
5233 if (!(fFcw & X86_FCW_PM))
5234 fFsw |= X86_FSW_ES | X86_FSW_B;
5235 }
5236
5237 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5238 }
5239 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5240 {
5241 pr80Dst->s.fSign = Tmp.s64.fSign;
5242 pr80Dst->s.uExponent = 0;
5243 pr80Dst->s.uMantissa = 0;
5244 }
5245 else if (RTFLOAT128U_IS_INF(&Tmp))
5246 {
5247 pr80Dst->s.fSign = Tmp.s64.fSign;
5248 pr80Dst->s.uExponent = 0x7fff;
5249 pr80Dst->s.uMantissa = 0;
5250 }
5251 return fFsw;
5252}
5253
5254
5255/**
5256 * Helper for transfering exception and C1 to FSW and setting the result value
5257 * accordingly.
5258 *
5259 * @returns Updated FSW.
5260 * @param pSoftState The SoftFloat state following the operation.
5261 * @param r80XResult The result of the SoftFloat operation.
5262 * @param pr80Result Where to store the result for IEM.
5263 * @param fFcw The FPU control word.
5264 * @param fFsw The FSW before the operation, with necessary bits
5265 * cleared and such.
5266 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5267 * raised.
5268 */
5269DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5270 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5271 PCRTFLOAT80U pr80XcptResult)
5272{
5273 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5274 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5275 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5276 fFsw |= X86_FSW_ES | X86_FSW_B;
5277
5278 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5279 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5280 else
5281 {
5282 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5283 *pr80Result = *pr80XcptResult;
5284 }
5285 return fFsw;
5286}
5287
5288
5289/**
5290 * Helper doing polynomial evaluation using Horner's method.
5291 *
5292 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5293 */
5294float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5295 unsigned cPrecision, softfloat_state_t *pSoftState)
5296{
5297 Assert(cHornerConsts > 1);
5298 size_t i = cHornerConsts - 1;
5299 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5300 while (i-- > 0)
5301 {
5302 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5303 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5304 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5305 }
5306 return r128Result;
5307}
5308
5309#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5310
5311
5312/**
5313 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5314 * mantissa, exponent and sign.
5315 *
5316 * @returns Updated FSW.
5317 * @param pr80Dst Where to return the composed value.
5318 * @param fSign The sign.
5319 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5320 * ignored and should be zero. This will probably be
5321 * modified during normalization and rounding.
5322 * @param iExponent Unbiased exponent.
5323 * @param fFcw The FPU control word.
5324 * @param fFsw The FPU status word.
5325 */
5326static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5327 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5328{
5329 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5330
5331 iExponent += RTFLOAT80U_EXP_BIAS;
5332
5333 /* Do normalization if necessary and possible. */
5334 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5335 {
5336 int cShift = 192 - RTUInt256BitCount(puMantissa);
5337 if (iExponent > cShift)
5338 iExponent -= cShift;
5339 else
5340 {
5341 if (fFcw & X86_FCW_UM)
5342 {
5343 if (iExponent > 0)
5344 cShift = --iExponent;
5345 else
5346 cShift = 0;
5347 }
5348 iExponent -= cShift;
5349 }
5350 RTUInt256AssignShiftLeft(puMantissa, cShift);
5351 }
5352
5353 /* Do rounding. */
5354 uint64_t uMantissa = puMantissa->QWords.qw2;
5355 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5356 {
5357 bool fAdd;
5358 switch (fFcw & X86_FCW_RC_MASK)
5359 {
5360 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5361 case X86_FCW_RC_NEAREST:
5362 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5363 {
5364 if ( (uMantissa & 1)
5365 || puMantissa->QWords.qw0 != 0
5366 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5367 {
5368 fAdd = true;
5369 break;
5370 }
5371 uMantissa &= ~(uint64_t)1;
5372 }
5373 fAdd = false;
5374 break;
5375 case X86_FCW_RC_ZERO:
5376 fAdd = false;
5377 break;
5378 case X86_FCW_RC_UP:
5379 fAdd = !fSign;
5380 break;
5381 case X86_FCW_RC_DOWN:
5382 fAdd = fSign;
5383 break;
5384 }
5385 if (fAdd)
5386 {
5387 uint64_t const uTmp = uMantissa;
5388 uMantissa = uTmp + 1;
5389 if (uMantissa < uTmp)
5390 {
5391 uMantissa >>= 1;
5392 uMantissa |= RT_BIT_64(63);
5393 iExponent++;
5394 }
5395 fFsw |= X86_FSW_C1;
5396 }
5397 fFsw |= X86_FSW_PE;
5398 if (!(fFcw & X86_FCW_PM))
5399 fFsw |= X86_FSW_ES | X86_FSW_B;
5400 }
5401
5402 /* Check for underflow (denormals). */
5403 if (iExponent <= 0)
5404 {
5405 if (fFcw & X86_FCW_UM)
5406 {
5407 if (uMantissa & RT_BIT_64(63))
5408 uMantissa >>= 1;
5409 iExponent = 0;
5410 }
5411 else
5412 {
5413 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5414 fFsw |= X86_FSW_ES | X86_FSW_B;
5415 }
5416 fFsw |= X86_FSW_UE;
5417 }
5418 /* Check for overflow */
5419 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5420 {
5421 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5422 }
5423
5424 /* Compose the result. */
5425 pr80Dst->s.uMantissa = uMantissa;
5426 pr80Dst->s.uExponent = iExponent;
5427 pr80Dst->s.fSign = fSign;
5428 return fFsw;
5429}
5430
5431
5432/**
5433 * See also iemAImpl_fld_r80_from_r32
5434 */
5435static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5436{
5437 uint16_t fFsw = 0;
5438 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5439 {
5440 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5441 pr80Dst->sj64.fInteger = 1;
5442 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5443 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5444 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5445 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5446 }
5447 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5448 {
5449 pr80Dst->s.fSign = pr32Val->s.fSign;
5450 pr80Dst->s.uExponent = 0;
5451 pr80Dst->s.uMantissa = 0;
5452 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5453 }
5454 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5455 {
5456 /* Subnormal -> normalized + X86_FSW_DE return. */
5457 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5458 pr80Dst->sj64.fInteger = 1;
5459 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5460 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5461 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5462 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5463 fFsw = X86_FSW_DE;
5464 }
5465 else if (RTFLOAT32U_IS_INF(pr32Val))
5466 {
5467 pr80Dst->s.fSign = pr32Val->s.fSign;
5468 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5469 pr80Dst->s.uMantissa = RT_BIT_64(63);
5470 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5471 }
5472 else
5473 {
5474 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5475 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5476 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5477 pr80Dst->sj64.fInteger = 1;
5478 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5479 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5480 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5481 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5482 }
5483 return fFsw;
5484}
5485
5486
5487/**
5488 * See also iemAImpl_fld_r80_from_r64
5489 */
5490static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5491{
5492 uint16_t fFsw = 0;
5493 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5494 {
5495 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5496 pr80Dst->sj64.fInteger = 1;
5497 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5498 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5499 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5500 }
5501 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5502 {
5503 pr80Dst->s.fSign = pr64Val->s.fSign;
5504 pr80Dst->s.uExponent = 0;
5505 pr80Dst->s.uMantissa = 0;
5506 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5507 }
5508 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5509 {
5510 /* Subnormal values gets normalized. */
5511 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5512 pr80Dst->sj64.fInteger = 1;
5513 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5514 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5515 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5516 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5517 fFsw = X86_FSW_DE;
5518 }
5519 else if (RTFLOAT64U_IS_INF(pr64Val))
5520 {
5521 pr80Dst->s.fSign = pr64Val->s.fSign;
5522 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5523 pr80Dst->s.uMantissa = RT_BIT_64(63);
5524 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5525 }
5526 else
5527 {
5528 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5529 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5530 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5531 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5532 pr80Dst->sj64.fInteger = 1;
5533 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5534 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5535 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5536 }
5537 return fFsw;
5538}
5539
5540
5541/**
5542 * See also EMIT_FILD.
5543 */
5544#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5545static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5546{ \
5547 if (iVal == 0) \
5548 { \
5549 pr80Dst->s.fSign = 0; \
5550 pr80Dst->s.uExponent = 0; \
5551 pr80Dst->s.uMantissa = 0; \
5552 } \
5553 else \
5554 { \
5555 if (iVal > 0) \
5556 pr80Dst->s.fSign = 0; \
5557 else \
5558 { \
5559 pr80Dst->s.fSign = 1; \
5560 iVal = -iVal; \
5561 } \
5562 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5563 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5564 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5565 } \
5566 return pr80Dst; \
5567}
5568EMIT_CONVERT_IXX_TO_R80(16)
5569EMIT_CONVERT_IXX_TO_R80(32)
5570//EMIT_CONVERT_IXX_TO_R80(64)
5571
5572/** For implementing iemAImpl_fmul_r80_by_r64 and such. */
5573#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
5574IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
5575{ \
5576 RTFLOAT80U r80Val2; \
5577 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
5578 Assert(!fFsw || fFsw == X86_FSW_DE); \
5579 if (fFsw) \
5580 { \
5581 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
5582 fFsw = 0; \
5583 else if (!(pFpuState->FCW & X86_FCW_DM)) \
5584 { \
5585 pFpuRes->r80Result = *pr80Val1; \
5586 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
5587 | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
5588 return; \
5589 } \
5590 } \
5591 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
5592 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
5593}
5594
5595/** For implementing iemAImpl_fmul_r80_by_r32 and such. */
5596#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
5597IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
5598{ \
5599 RTFLOAT80U r80Val2; \
5600 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
5601 Assert(!fFsw || fFsw == X86_FSW_DE); \
5602 if (fFsw) \
5603 { \
5604 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
5605 fFsw = 0; \
5606 else if (!(pFpuState->FCW & X86_FCW_DM)) \
5607 { \
5608 pFpuRes->r80Result = *pr80Val1; \
5609 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
5610 | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
5611 return; \
5612 } \
5613 } \
5614 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
5615 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
5616}
5617
5618/** For implementing iemAImpl_fimul_r80_by_i32 and such. */
5619#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
5620IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
5621{ \
5622 RTFLOAT80U r80Val2; \
5623 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2)); \
5624 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
5625}
5626
5627/** For implementing iemAImpl_fimul_r80_by_i16 and such. */
5628#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
5629IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
5630{ \
5631 RTFLOAT80U r80Val2; \
5632 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2)); \
5633 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
5634}
5635
5636
5637
5638/*********************************************************************************************************************************
5639* x86 FPU Division Operations *
5640*********************************************************************************************************************************/
5641
5642/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5643static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5644 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5645{
5646 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5647 {
5648 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5649 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5650 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5651 }
5652 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5653 { /* Div by zero. */
5654 if (fFcw & X86_FCW_ZM)
5655 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5656 else
5657 {
5658 *pr80Result = *pr80Val1Org;
5659 fFsw |= X86_FSW_ES | X86_FSW_B;
5660 }
5661 fFsw |= X86_FSW_ZE;
5662 }
5663 else
5664 { /* Invalid operand */
5665 if (fFcw & X86_FCW_IM)
5666 *pr80Result = g_r80Indefinite;
5667 else
5668 {
5669 *pr80Result = *pr80Val1Org;
5670 fFsw |= X86_FSW_ES | X86_FSW_B;
5671 }
5672 fFsw |= X86_FSW_IE;
5673 }
5674 return fFsw;
5675}
5676
5677
5678IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5679 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5680{
5681 uint16_t const fFcw = pFpuState->FCW;
5682 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5683
5684 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5685 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5686 {
5687 if (fFcw & X86_FCW_IM)
5688 pFpuRes->r80Result = g_r80Indefinite;
5689 else
5690 {
5691 pFpuRes->r80Result = *pr80Val1;
5692 fFsw |= X86_FSW_ES | X86_FSW_B;
5693 }
5694 fFsw |= X86_FSW_IE;
5695 }
5696 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5697 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5698 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5699 {
5700 if (fFcw & X86_FCW_DM)
5701 {
5702 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5703 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5704 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5705 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5706 }
5707 else
5708 {
5709 pFpuRes->r80Result = *pr80Val1;
5710 fFsw |= X86_FSW_ES | X86_FSW_B;
5711 }
5712 fFsw |= X86_FSW_DE;
5713 }
5714 /* SoftFloat can handle the rest: */
5715 else
5716 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5717
5718 pFpuRes->FSW = fFsw;
5719}
5720
5721
5722EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5723EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5724EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5725EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5726
5727
5728IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5729 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5730{
5731 uint16_t const fFcw = pFpuState->FCW;
5732 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5733
5734 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5735 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5736 {
5737 if (fFcw & X86_FCW_IM)
5738 pFpuRes->r80Result = g_r80Indefinite;
5739 else
5740 {
5741 pFpuRes->r80Result = *pr80Val1;
5742 fFsw |= X86_FSW_ES | X86_FSW_B;
5743 }
5744 fFsw |= X86_FSW_IE;
5745 }
5746 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5747 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5748 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5749 {
5750 if (fFcw & X86_FCW_DM)
5751 {
5752 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5753 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5754 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5755 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5756 }
5757 else
5758 {
5759 pFpuRes->r80Result = *pr80Val1;
5760 fFsw |= X86_FSW_ES | X86_FSW_B;
5761 }
5762 fFsw |= X86_FSW_DE;
5763 }
5764 /* SoftFloat can handle the rest: */
5765 else
5766 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5767
5768 pFpuRes->FSW = fFsw;
5769}
5770
5771
5772EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5773EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5774EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5775EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5776
5777
5778/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5779static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5780 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5781{
5782 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5783 {
5784 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5785 uint16_t fCxFlags = 0;
5786 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5787 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5788 &fCxFlags, &SoftState);
5789 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5790 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5791 if ( !(fFsw & X86_FSW_IE)
5792 && !RTFLOAT80U_IS_NAN(pr80Result)
5793 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5794 {
5795 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5796 fFsw |= fCxFlags & X86_FSW_C_MASK;
5797 }
5798 return fFsw;
5799 }
5800
5801 /* Invalid operand */
5802 if (fFcw & X86_FCW_IM)
5803 *pr80Result = g_r80Indefinite;
5804 else
5805 {
5806 *pr80Result = *pr80Val1Org;
5807 fFsw |= X86_FSW_ES | X86_FSW_B;
5808 }
5809 return fFsw | X86_FSW_IE;
5810}
5811
5812
5813static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5814 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5815{
5816 uint16_t const fFcw = pFpuState->FCW;
5817 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5818
5819 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5820 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5821 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5822 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5823 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5824 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5825 {
5826 if (fFcw & X86_FCW_IM)
5827 pFpuRes->r80Result = g_r80Indefinite;
5828 else
5829 {
5830 pFpuRes->r80Result = *pr80Val1;
5831 fFsw |= X86_FSW_ES | X86_FSW_B;
5832 }
5833 fFsw |= X86_FSW_IE;
5834 }
5835 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5836 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5837 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5838 {
5839 if (fFcw & X86_FCW_DM)
5840 {
5841 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5842 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5843 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5844 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5845 pr80Val1Org, fLegacyInstr);
5846 }
5847 else
5848 {
5849 pFpuRes->r80Result = *pr80Val1;
5850 fFsw |= X86_FSW_ES | X86_FSW_B;
5851 }
5852 fFsw |= X86_FSW_DE;
5853 }
5854 /* SoftFloat can handle the rest: */
5855 else
5856 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5857 pr80Val1, fLegacyInstr);
5858
5859 pFpuRes->FSW = fFsw;
5860}
5861
5862
5863IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5864 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5865{
5866 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5867}
5868
5869
5870IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5871 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5872{
5873 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5874}
5875
5876
5877/*********************************************************************************************************************************
5878* x87 FPU Multiplication Operations *
5879*********************************************************************************************************************************/
5880
5881/** Worker for iemAImpl_fmul_r80_by_r80. */
5882static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5883 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5884{
5885 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5886 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5887 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5888}
5889
5890
5891IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5892 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5893{
5894 uint16_t const fFcw = pFpuState->FCW;
5895 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5896
5897 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5898 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5899 {
5900 if (fFcw & X86_FCW_IM)
5901 pFpuRes->r80Result = g_r80Indefinite;
5902 else
5903 {
5904 pFpuRes->r80Result = *pr80Val1;
5905 fFsw |= X86_FSW_ES | X86_FSW_B;
5906 }
5907 fFsw |= X86_FSW_IE;
5908 }
5909 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5910 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5911 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5912 {
5913 if (fFcw & X86_FCW_DM)
5914 {
5915 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5916 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5917 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5918 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5919 }
5920 else
5921 {
5922 pFpuRes->r80Result = *pr80Val1;
5923 fFsw |= X86_FSW_ES | X86_FSW_B;
5924 }
5925 fFsw |= X86_FSW_DE;
5926 }
5927 /* SoftFloat can handle the rest: */
5928 else
5929 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5930
5931 pFpuRes->FSW = fFsw;
5932}
5933
5934
5935EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5936EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5937EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5938EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5939
5940
5941/*********************************************************************************************************************************
5942* x87 FPU Addition *
5943*********************************************************************************************************************************/
5944
5945/** Worker for iemAImpl_fadd_r80_by_r80. */
5946static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5947 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5948{
5949 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5950 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5951 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5952}
5953
5954
5955IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5956 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5957{
5958 uint16_t const fFcw = pFpuState->FCW;
5959 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5960
5961 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5962 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5963 {
5964 if (fFcw & X86_FCW_IM)
5965 pFpuRes->r80Result = g_r80Indefinite;
5966 else
5967 {
5968 pFpuRes->r80Result = *pr80Val1;
5969 fFsw |= X86_FSW_ES | X86_FSW_B;
5970 }
5971 fFsw |= X86_FSW_IE;
5972 }
5973 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5974 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5975 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5976 {
5977 if (fFcw & X86_FCW_DM)
5978 {
5979 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5980 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5981 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5982 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5983 }
5984 else
5985 {
5986 pFpuRes->r80Result = *pr80Val1;
5987 fFsw |= X86_FSW_ES | X86_FSW_B;
5988 }
5989 fFsw |= X86_FSW_DE;
5990 }
5991 /* SoftFloat can handle the rest: */
5992 else
5993 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5994
5995 pFpuRes->FSW = fFsw;
5996}
5997
5998
5999EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6000EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6001EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6002EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6003
6004
6005/*********************************************************************************************************************************
6006* x87 FPU Subtraction *
6007*********************************************************************************************************************************/
6008
6009/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6010static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6011 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6012{
6013 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6014 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6015 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6016}
6017
6018
6019IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6020 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6021{
6022 uint16_t const fFcw = pFpuState->FCW;
6023 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6024
6025 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6026 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6027 {
6028 if (fFcw & X86_FCW_IM)
6029 pFpuRes->r80Result = g_r80Indefinite;
6030 else
6031 {
6032 pFpuRes->r80Result = *pr80Val1;
6033 fFsw |= X86_FSW_ES | X86_FSW_B;
6034 }
6035 fFsw |= X86_FSW_IE;
6036 }
6037 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6038 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6039 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6040 {
6041 if (fFcw & X86_FCW_DM)
6042 {
6043 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6044 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6045 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6046 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6047 }
6048 else
6049 {
6050 pFpuRes->r80Result = *pr80Val1;
6051 fFsw |= X86_FSW_ES | X86_FSW_B;
6052 }
6053 fFsw |= X86_FSW_DE;
6054 }
6055 /* SoftFloat can handle the rest: */
6056 else
6057 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6058
6059 pFpuRes->FSW = fFsw;
6060}
6061
6062
6063EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6064EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6065EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6066EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6067
6068
6069/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6070IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6071 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6072{
6073 uint16_t const fFcw = pFpuState->FCW;
6074 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6075
6076 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6077 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6078 {
6079 if (fFcw & X86_FCW_IM)
6080 pFpuRes->r80Result = g_r80Indefinite;
6081 else
6082 {
6083 pFpuRes->r80Result = *pr80Val1;
6084 fFsw |= X86_FSW_ES | X86_FSW_B;
6085 }
6086 fFsw |= X86_FSW_IE;
6087 }
6088 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6089 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6090 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6091 {
6092 if (fFcw & X86_FCW_DM)
6093 {
6094 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6095 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6096 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6097 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6098 }
6099 else
6100 {
6101 pFpuRes->r80Result = *pr80Val1;
6102 fFsw |= X86_FSW_ES | X86_FSW_B;
6103 }
6104 fFsw |= X86_FSW_DE;
6105 }
6106 /* SoftFloat can handle the rest: */
6107 else
6108 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6109
6110 pFpuRes->FSW = fFsw;
6111}
6112
6113
6114EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6115EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6116EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6117EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6118
6119
6120/*********************************************************************************************************************************
6121* x87 FPU Trigometric Operations *
6122*********************************************************************************************************************************/
6123static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6124{
6125 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6126 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6127 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6128 extFloat80_t v;
6129 (void)fFcw;
6130
6131 v = extF80_atan2(y, x, &SoftState);
6132
6133 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6134 return fFsw;
6135}
6136
6137IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6138 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6139{
6140 uint16_t const fFcw = pFpuState->FCW;
6141 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6142
6143 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6144 {
6145 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6146
6147 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6148 if (!(fFcw & X86_FCW_PM))
6149 fFsw |= X86_FSW_ES | X86_FSW_B;
6150 }
6151 else
6152 {
6153 fFsw |= X86_FSW_IE;
6154 if (!(fFcw & X86_FCW_IM))
6155 {
6156 pFpuRes->r80Result = *pr80Val2;
6157 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6158 }
6159 else
6160 {
6161 pFpuRes->r80Result = g_r80Indefinite;
6162 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6163 }
6164 }
6165
6166 pFpuRes->FSW = fFsw;
6167}
6168#endif /* IEM_WITHOUT_ASSEMBLY */
6169
6170IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6171 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6172{
6173 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6174}
6175
6176IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6177 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6178{
6179 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6180}
6181
6182
6183#if defined(IEM_WITHOUT_ASSEMBLY)
6184static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6185{
6186 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6187 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6188 extFloat80_t v;
6189 (void)fFcw;
6190
6191 v = extF80_tan(x, &SoftState);
6192
6193 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6194 return fFsw;
6195}
6196
6197IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6198{
6199 uint16_t const fFcw = pFpuState->FCW;
6200 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6201
6202 if (RTFLOAT80U_IS_ZERO(pr80Val))
6203 {
6204 pFpuResTwo->r80Result1 = *pr80Val;
6205 pFpuResTwo->r80Result2 = g_ar80One[0];
6206 }
6207 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6208 {
6209 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6210 {
6211 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6212 pFpuResTwo->r80Result1 = *pr80Val;
6213 }
6214 else
6215 {
6216 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6217 {
6218 pFpuResTwo->r80Result1 = *pr80Val;
6219 }
6220 else
6221 {
6222 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6223 }
6224
6225 pFpuResTwo->r80Result2 = g_ar80One[0];
6226
6227 fFsw |= X86_FSW_PE;
6228 if (!(fFcw & X86_FCW_PM))
6229 fFsw |= X86_FSW_ES | X86_FSW_B;
6230 }
6231 }
6232 else
6233 {
6234 fFsw |= X86_FSW_IE;
6235 if (!(fFcw & X86_FCW_IM))
6236 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6237 }
6238
6239 pFpuResTwo->FSW = fFsw;
6240}
6241#endif /* IEM_WITHOUT_ASSEMBLY */
6242
6243IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6244{
6245 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6246}
6247
6248IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6249{
6250 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6251}
6252
6253#ifdef IEM_WITHOUT_ASSEMBLY
6254
6255static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6256{
6257 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6258 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6259 extFloat80_t v;
6260 (void)fFcw;
6261
6262 v = extF80_sin(x, &SoftState);
6263
6264 iemFpuSoftF80ToIprt(pr80Result, v);
6265
6266 return fFsw;
6267}
6268
6269IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6270{
6271 uint16_t const fFcw = pFpuState->FCW;
6272 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6273
6274 if (RTFLOAT80U_IS_ZERO(pr80Val))
6275 {
6276 pFpuRes->r80Result = *pr80Val;
6277 }
6278 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6279 {
6280 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6281 {
6282 fFsw |= X86_FSW_C2;
6283 pFpuRes->r80Result = *pr80Val;
6284 }
6285 else
6286 {
6287 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6288 {
6289 pFpuRes->r80Result = *pr80Val;
6290 }
6291 else
6292 {
6293 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6294 }
6295 fFsw |= X86_FSW_PE;
6296 if (!(fFcw & X86_FCW_PM))
6297 fFsw |= X86_FSW_ES | X86_FSW_B;
6298 }
6299 }
6300 else if (RTFLOAT80U_IS_INF(pr80Val))
6301 {
6302 fFsw |= X86_FSW_IE;
6303 if (!(fFcw & X86_FCW_IM))
6304 {
6305 fFsw |= X86_FSW_ES | X86_FSW_B;
6306 pFpuRes->r80Result = *pr80Val;
6307 }
6308 else
6309 {
6310 pFpuRes->r80Result = g_r80Indefinite;
6311 }
6312 }
6313 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6314 {
6315 fFsw |= X86_FSW_DE;
6316
6317 if (fFcw & X86_FCW_DM)
6318 {
6319 if (fFcw & X86_FCW_UM)
6320 {
6321 pFpuRes->r80Result = *pr80Val;
6322 }
6323 else
6324 {
6325 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6326 uint64_t uMantissa = pr80Val->s.uMantissa;
6327 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6328
6329 uExponent = 64 - uExponent;
6330 uMantissa <<= uExponent;
6331 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6332
6333 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6334 pFpuRes->r80Result.s.uMantissa = uMantissa;
6335 pFpuRes->r80Result.s.uExponent = uExponent;
6336 }
6337
6338 fFsw |= X86_FSW_UE | X86_FSW_PE;
6339
6340 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6341 {
6342 /* All the exceptions are masked. */
6343 }
6344 else
6345 {
6346 fFsw |= X86_FSW_ES | X86_FSW_B;
6347 }
6348 }
6349 else
6350 {
6351 pFpuRes->r80Result = *pr80Val;
6352
6353 fFsw |= X86_FSW_ES | X86_FSW_B;
6354 }
6355 }
6356 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6357 {
6358 pFpuRes->r80Result = *pr80Val;
6359 fFsw |= X86_FSW_DE;
6360
6361 if (fFcw & X86_FCW_DM)
6362 {
6363 if (fFcw & X86_FCW_PM)
6364 {
6365 fFsw |= X86_FSW_PE;
6366 }
6367 else
6368 {
6369 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6370 }
6371
6372 pFpuRes->r80Result.sj64.uExponent = 1;
6373 }
6374 else
6375 {
6376 fFsw |= X86_FSW_ES | X86_FSW_B;
6377 }
6378 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6379 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6380 {
6381 pFpuRes->r80Result = *pr80Val;
6382 } else {
6383 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6384 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6385 && (fFcw & X86_FCW_IM))
6386 pFpuRes->r80Result = g_r80Indefinite;
6387 else
6388 {
6389 pFpuRes->r80Result = *pr80Val;
6390 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6391 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6392 }
6393
6394 fFsw |= X86_FSW_IE;
6395 if (!(fFcw & X86_FCW_IM))
6396 fFsw |= X86_FSW_ES | X86_FSW_B;
6397 }
6398
6399 pFpuRes->FSW = fFsw;
6400}
6401#endif /* IEM_WITHOUT_ASSEMBLY */
6402
6403IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6404{
6405 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6406}
6407
6408IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6409{
6410 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6411}
6412
6413#ifdef IEM_WITHOUT_ASSEMBLY
6414
6415static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6416{
6417 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6418 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6419 extFloat80_t v;
6420 (void)fFcw;
6421
6422 v = extF80_cos(x, &SoftState);
6423
6424 iemFpuSoftF80ToIprt(pr80Result, v);
6425
6426 return fFsw;
6427}
6428
6429IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6430{
6431 uint16_t const fFcw = pFpuState->FCW;
6432 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6433
6434 if (RTFLOAT80U_IS_ZERO(pr80Val))
6435 {
6436 pFpuRes->r80Result = g_ar80One[0];
6437 }
6438 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6439 {
6440 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6441 {
6442 fFsw |= X86_FSW_C2;
6443 pFpuRes->r80Result = *pr80Val;
6444 }
6445 else
6446 {
6447 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6448 {
6449 pFpuRes->r80Result = g_ar80One[0];
6450
6451 }
6452 else
6453 {
6454 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6455 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6456 }
6457 fFsw |= X86_FSW_PE;
6458 if (!(fFcw & X86_FCW_PM))
6459 fFsw |= X86_FSW_ES | X86_FSW_B;
6460 }
6461 }
6462 else if (RTFLOAT80U_IS_INF(pr80Val))
6463 {
6464 fFsw |= X86_FSW_IE;
6465 if (!(fFcw & X86_FCW_IM))
6466 {
6467 fFsw |= X86_FSW_ES | X86_FSW_B;
6468 pFpuRes->r80Result = *pr80Val;
6469 }
6470 else
6471 {
6472 pFpuRes->r80Result = g_r80Indefinite;
6473 }
6474 }
6475 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6476 {
6477 fFsw |= X86_FSW_DE;
6478
6479 if (fFcw & X86_FCW_DM)
6480 {
6481 pFpuRes->r80Result = g_ar80One[0];
6482
6483 if (fFcw & X86_FCW_PM)
6484 {
6485 fFsw |= X86_FSW_PE;
6486 }
6487 else
6488 {
6489 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6490 }
6491 }
6492 else
6493 {
6494 pFpuRes->r80Result = *pr80Val;
6495 fFsw |= X86_FSW_ES | X86_FSW_B;
6496 }
6497 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6498 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6499 {
6500 pFpuRes->r80Result = *pr80Val;
6501 } else {
6502 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6503 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6504 && (fFcw & X86_FCW_IM))
6505 pFpuRes->r80Result = g_r80Indefinite;
6506 else
6507 {
6508 pFpuRes->r80Result = *pr80Val;
6509 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6510 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6511 }
6512
6513 fFsw |= X86_FSW_IE;
6514 if (!(fFcw & X86_FCW_IM))
6515 fFsw |= X86_FSW_ES | X86_FSW_B;
6516 }
6517
6518 pFpuRes->FSW = fFsw;
6519}
6520#endif /* IEM_WITHOUT_ASSEMBLY */
6521
6522IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6523{
6524 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6525}
6526
6527IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6528{
6529 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6530}
6531
6532#ifdef IEM_WITHOUT_ASSEMBLY
6533
6534static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6535{
6536 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6537 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6538 extFloat80_t r80Sin, r80Cos;
6539 (void)fFcw;
6540
6541 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6542
6543 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6544 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6545
6546 return fFsw;
6547}
6548
6549IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6550{
6551 uint16_t const fFcw = pFpuState->FCW;
6552 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6553
6554 if (RTFLOAT80U_IS_ZERO(pr80Val))
6555 {
6556 pFpuResTwo->r80Result1 = *pr80Val;
6557 pFpuResTwo->r80Result2 = g_ar80One[0];
6558 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6559 }
6560 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6561 {
6562 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6563 {
6564 fFsw |= X86_FSW_C2;
6565
6566 if (fFcw & X86_FCW_IM)
6567 {
6568 pFpuResTwo->r80Result1 = g_r80Indefinite;
6569 }
6570 else
6571 {
6572 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6573 }
6574
6575 pFpuResTwo->r80Result2 = *pr80Val;
6576 }
6577 else
6578 {
6579 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6580
6581 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6582 {
6583 pFpuResTwo->r80Result1 = *pr80Val;
6584 pFpuResTwo->r80Result2 = g_ar80One[0];
6585 }
6586 else
6587 {
6588 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6589 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6590 }
6591 fFsw |= X86_FSW_PE;
6592 if (!(fFcw & X86_FCW_PM))
6593 fFsw |= X86_FSW_ES | X86_FSW_B;
6594 }
6595 }
6596 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6597 {
6598 fFsw |= X86_FSW_DE;
6599
6600 if (fFcw & X86_FCW_DM)
6601 {
6602 pFpuResTwo->r80Result1 = *pr80Val;
6603 pFpuResTwo->r80Result2 = g_ar80One[0];
6604 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6605
6606 if (fFcw & X86_FCW_PM)
6607 {
6608 fFsw |= X86_FSW_PE;
6609 }
6610 else
6611 {
6612 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6613 }
6614
6615 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6616 }
6617 else
6618 {
6619 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6620 pFpuResTwo->r80Result2 = *pr80Val;
6621 fFsw |= X86_FSW_ES | X86_FSW_B;
6622 }
6623 }
6624 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6625 {
6626 fFsw |= X86_FSW_DE;
6627
6628 if (fFcw & X86_FCW_DM)
6629 {
6630 pFpuResTwo->r80Result2 = g_ar80One[0];
6631
6632 if (fFcw & X86_FCW_UM)
6633 {
6634 pFpuResTwo->r80Result1 = *pr80Val;
6635 }
6636 else
6637 {
6638 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6639 uint64_t uMantissa = pr80Val->s.uMantissa;
6640 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6641
6642 uExponent = 64 - uExponent;
6643 uMantissa <<= uExponent;
6644 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6645
6646 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6647 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6648 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6649 }
6650
6651 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6652 fFsw |= X86_FSW_UE | X86_FSW_PE;
6653
6654 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6655 {
6656 /* All the exceptions are masked. */
6657 }
6658 else
6659 {
6660 fFsw |= X86_FSW_ES | X86_FSW_B;
6661 }
6662 }
6663 else
6664 {
6665 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6666 pFpuResTwo->r80Result2 = *pr80Val;
6667 fFsw |= X86_FSW_ES | X86_FSW_B;
6668 }
6669 }
6670 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6671 {
6672 pFpuResTwo->r80Result1 = *pr80Val;
6673 pFpuResTwo->r80Result2 = *pr80Val;
6674 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6675 }
6676 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6677 {
6678 if (fFcw & X86_FCW_IM)
6679 {
6680 pFpuResTwo->r80Result1 = g_r80Indefinite;
6681 pFpuResTwo->r80Result2 = g_r80Indefinite;
6682 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6683 }
6684 else
6685 {
6686 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6687 pFpuResTwo->r80Result2 = *pr80Val;
6688 }
6689
6690 fFsw |= X86_FSW_IE;
6691 if (!(fFcw & X86_FCW_IM))
6692 fFsw |= X86_FSW_ES | X86_FSW_B;
6693 }
6694 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6695 {
6696 pFpuResTwo->r80Result1 = *pr80Val;
6697 pFpuResTwo->r80Result2 = *pr80Val;
6698
6699 if (fFcw & X86_FCW_IM)
6700 {
6701 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6702 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6703 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6704 }
6705 else
6706 {
6707 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6708 pFpuResTwo->r80Result2 = *pr80Val;
6709 }
6710
6711 fFsw |= X86_FSW_IE;
6712 if (!(fFcw & X86_FCW_IM))
6713 fFsw |= X86_FSW_ES | X86_FSW_B;
6714 }
6715 else if (RTFLOAT80U_IS_INF(pr80Val))
6716 {
6717 if (fFcw & X86_FCW_IM)
6718 {
6719 pFpuResTwo->r80Result1 = g_r80Indefinite;
6720 pFpuResTwo->r80Result2 = g_r80Indefinite;
6721 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6722 }
6723 else
6724 {
6725 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6726 pFpuResTwo->r80Result2 = *pr80Val;
6727 }
6728
6729 fFsw |= X86_FSW_IE;
6730 if (!(fFcw & X86_FCW_IM))
6731 fFsw |= X86_FSW_ES | X86_FSW_B;
6732 }
6733
6734 pFpuResTwo->FSW = fFsw;
6735}
6736#endif /* IEM_WITHOUT_ASSEMBLY */
6737
6738IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6739{
6740 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6741}
6742
6743IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6744{
6745 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6746}
6747
6748#ifdef IEM_WITHOUT_ASSEMBLY
6749
6750
6751/*********************************************************************************************************************************
6752* x87 FPU Compare and Testing Operations *
6753*********************************************************************************************************************************/
6754
6755IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6756{
6757 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6758
6759 if (RTFLOAT80U_IS_ZERO(pr80Val))
6760 fFsw |= X86_FSW_C3;
6761 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6762 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6763 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6764 {
6765 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6766 if (!(pFpuState->FCW & X86_FCW_DM))
6767 fFsw |= X86_FSW_ES | X86_FSW_B;
6768 }
6769 else
6770 {
6771 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6772 if (!(pFpuState->FCW & X86_FCW_IM))
6773 fFsw |= X86_FSW_ES | X86_FSW_B;
6774 }
6775
6776 *pu16Fsw = fFsw;
6777}
6778
6779
6780IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6781{
6782 RT_NOREF(pFpuState);
6783 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6784
6785 /* C1 = sign bit (always, even if empty Intel says). */
6786 if (pr80Val->s.fSign)
6787 fFsw |= X86_FSW_C1;
6788
6789 /* Classify the value in C0, C2, C3. */
6790 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6791 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6792 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6793 fFsw |= X86_FSW_C2;
6794 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6795 fFsw |= X86_FSW_C3;
6796 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6797 fFsw |= X86_FSW_C0;
6798 else if (RTFLOAT80U_IS_INF(pr80Val))
6799 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6800 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6801 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6802 /* whatever else: 0 */
6803
6804 *pu16Fsw = fFsw;
6805}
6806
6807
6808/**
6809 * Worker for fcom, fucom, and friends.
6810 */
6811static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6812 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6813{
6814 /*
6815 * Unpack the values.
6816 */
6817 bool const fSign1 = pr80Val1->s.fSign;
6818 int32_t iExponent1 = pr80Val1->s.uExponent;
6819 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6820
6821 bool const fSign2 = pr80Val2->s.fSign;
6822 int32_t iExponent2 = pr80Val2->s.uExponent;
6823 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6824
6825 /*
6826 * Check for invalid inputs.
6827 */
6828 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6829 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6830 {
6831 if (!(fFcw & X86_FCW_IM))
6832 fFsw |= X86_FSW_ES | X86_FSW_B;
6833 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6834 }
6835
6836 /*
6837 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6838 */
6839 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6840 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6841 {
6842 if ( fIeOnAllNaNs
6843 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6844 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6845 {
6846 fFsw |= X86_FSW_IE;
6847 if (!(fFcw & X86_FCW_IM))
6848 fFsw |= X86_FSW_ES | X86_FSW_B;
6849 }
6850 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6851 }
6852
6853 /*
6854 * Normalize the values.
6855 */
6856 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6857 {
6858 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6859 iExponent1 = 1;
6860 else
6861 {
6862 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6863 uMantissa1 <<= iExponent1;
6864 iExponent1 = 1 - iExponent1;
6865 }
6866 fFsw |= X86_FSW_DE;
6867 if (!(fFcw & X86_FCW_DM))
6868 fFsw |= X86_FSW_ES | X86_FSW_B;
6869 }
6870
6871 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6872 {
6873 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6874 iExponent2 = 1;
6875 else
6876 {
6877 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6878 uMantissa2 <<= iExponent2;
6879 iExponent2 = 1 - iExponent2;
6880 }
6881 fFsw |= X86_FSW_DE;
6882 if (!(fFcw & X86_FCW_DM))
6883 fFsw |= X86_FSW_ES | X86_FSW_B;
6884 }
6885
6886 /*
6887 * Test if equal (val1 == val2):
6888 */
6889 if ( uMantissa1 == uMantissa2
6890 && iExponent1 == iExponent2
6891 && ( fSign1 == fSign2
6892 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6893 fFsw |= X86_FSW_C3;
6894 /*
6895 * Test if less than (val1 < val2):
6896 */
6897 else if (fSign1 && !fSign2)
6898 fFsw |= X86_FSW_C0;
6899 else if (fSign1 == fSign2)
6900 {
6901 /* Zeros are problematic, however at the most one can be zero here. */
6902 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6903 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6904 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6905 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6906
6907 if ( fSign1
6908 ^ ( iExponent1 < iExponent2
6909 || ( iExponent1 == iExponent2
6910 && uMantissa1 < uMantissa2 ) ) )
6911 fFsw |= X86_FSW_C0;
6912 }
6913 /* else: No flags set if greater. */
6914
6915 return fFsw;
6916}
6917
6918
6919IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6920 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6921{
6922 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6923}
6924
6925
6926
6927
6928IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6929 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6930{
6931 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6932}
6933
6934
6935IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6936 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6937{
6938 RTFLOAT80U r80Val2;
6939 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6940 Assert(!fFsw || fFsw == X86_FSW_DE);
6941 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6942 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6943 {
6944 if (!(pFpuState->FCW & X86_FCW_DM))
6945 fFsw |= X86_FSW_ES | X86_FSW_B;
6946 *pfFsw |= fFsw;
6947 }
6948}
6949
6950
6951IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6952 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6953{
6954 RTFLOAT80U r80Val2;
6955 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6956 Assert(!fFsw || fFsw == X86_FSW_DE);
6957 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6958 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6959 {
6960 if (!(pFpuState->FCW & X86_FCW_DM))
6961 fFsw |= X86_FSW_ES | X86_FSW_B;
6962 *pfFsw |= fFsw;
6963 }
6964}
6965
6966
6967IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6968 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6969{
6970 RTFLOAT80U r80Val2;
6971 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6972 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6973}
6974
6975
6976IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6977 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6978{
6979 RTFLOAT80U r80Val2;
6980 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6981 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6982}
6983
6984
6985/**
6986 * Worker for fcomi & fucomi.
6987 */
6988static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6989 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6990{
6991 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6992 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6993 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6994 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6995
6996 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6997 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6998 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6999}
7000
7001
7002IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7003 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7004{
7005 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
7006}
7007
7008
7009IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7010 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7011{
7012 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
7013}
7014
7015
7016/*********************************************************************************************************************************
7017* x87 FPU Other Operations *
7018*********************************************************************************************************************************/
7019
7020/**
7021 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7022 */
7023static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7024{
7025 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7026 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7027 true /*exact / generate #PE */, &SoftState));
7028 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7029}
7030
7031
7032IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7033{
7034 uint16_t const fFcw = pFpuState->FCW;
7035 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7036
7037 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7038 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7039 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7040 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7041 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7042 || RTFLOAT80U_IS_INF(pr80Val))
7043 pFpuRes->r80Result = *pr80Val;
7044 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7045 {
7046 fFsw |= X86_FSW_DE;
7047 if (fFcw & X86_FCW_DM)
7048 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7049 else
7050 {
7051 pFpuRes->r80Result = *pr80Val;
7052 fFsw |= X86_FSW_ES | X86_FSW_B;
7053 }
7054 }
7055 else
7056 {
7057 if (fFcw & X86_FCW_IM)
7058 {
7059 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7060 pFpuRes->r80Result = g_r80Indefinite;
7061 else
7062 {
7063 pFpuRes->r80Result = *pr80Val;
7064 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7065 }
7066 }
7067 else
7068 {
7069 pFpuRes->r80Result = *pr80Val;
7070 fFsw |= X86_FSW_ES | X86_FSW_B;
7071 }
7072 fFsw |= X86_FSW_IE;
7073 }
7074 pFpuRes->FSW = fFsw;
7075}
7076
7077
7078IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7079 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7080{
7081 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7082 it does everything we need it to do. */
7083 uint16_t const fFcw = pFpuState->FCW;
7084 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7085 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7086 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7087 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7088}
7089
7090
7091/**
7092 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7093 */
7094static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7095{
7096 Assert(!pr80Val->s.fSign);
7097 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7098 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7099 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7100}
7101
7102
7103IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7104{
7105 uint16_t const fFcw = pFpuState->FCW;
7106 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7107
7108 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7109 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7110 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7111 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7112 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7113 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7114 pFpuRes->r80Result = *pr80Val;
7115 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7116 {
7117 fFsw |= X86_FSW_DE;
7118 if (fFcw & X86_FCW_DM)
7119 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7120 else
7121 {
7122 pFpuRes->r80Result = *pr80Val;
7123 fFsw |= X86_FSW_ES | X86_FSW_B;
7124 }
7125 }
7126 else
7127 {
7128 if (fFcw & X86_FCW_IM)
7129 {
7130 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7131 pFpuRes->r80Result = g_r80Indefinite;
7132 else
7133 {
7134 pFpuRes->r80Result = *pr80Val;
7135 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7136 }
7137 }
7138 else
7139 {
7140 pFpuRes->r80Result = *pr80Val;
7141 fFsw |= X86_FSW_ES | X86_FSW_B;
7142 }
7143 fFsw |= X86_FSW_IE;
7144 }
7145 pFpuRes->FSW = fFsw;
7146}
7147
7148
7149/**
7150 * @code{.unparsed}
7151 * x x * ln2
7152 * f(x) = 2 - 1 = e - 1
7153 *
7154 * @endcode
7155 *
7156 * We can approximate e^x by a Taylor/Maclaurin series (see
7157 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7158 * @code{.unparsed}
7159 * n 0 1 2 3 4
7160 * inf x x x x x x
7161 * SUM ----- = --- + --- + --- + --- + --- + ...
7162 * n=0 n! 0! 1! 2! 3! 4!
7163 *
7164 * 2 3 4
7165 * x x x
7166 * = 1 + x + --- + --- + --- + ...
7167 * 2! 3! 4!
7168 * @endcode
7169 *
7170 * Given z = x * ln2, we get:
7171 * @code{.unparsed}
7172 * 2 3 4 n
7173 * z z z z z
7174 * e - 1 = z + --- + --- + --- + ... + ---
7175 * 2! 3! 4! n!
7176 * @endcode
7177 *
7178 * Wanting to use Horner's method, we move one z outside and get:
7179 * @code{.unparsed}
7180 * 2 3 (n-1)
7181 * z z z z
7182 * = z ( 1 + --- + --- + --- + ... + ------- )
7183 * 2! 3! 4! n!
7184 * @endcode
7185 *
7186 * The constants we need for using Horner's methods are 1 and 1 / n!.
7187 *
7188 * For very tiny x values, we can get away with f(x) = x * ln 2, because
7189 * because we don't have the necessary precision to represent 1.0 + z/3 + ...
7190 * and can approximate it to be 1.0. For a visual demonstration of this
7191 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
7192 * as it valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7193 *
7194 *
7195 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7196 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7197 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7198 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7199 * blocks). (The one bit difference is probably an implicit one missing from
7200 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7201 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7202 * exponent.
7203 *
7204 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
7205 * successfully reproduced the exact results from an Intel 10980XE, there is
7206 * always a portition of rounding differences. Not going to spend too much time
7207 * on getting this 100% the same, at least not now.
7208 *
7209 * P.S. If someone are really curious about 8087 and its contstants:
7210 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7211 *
7212 *
7213 * @param pr80Val The exponent value (x), less than 1.0, greater than
7214 * -1.0 and not zero. This can be a normal, denormal
7215 * or pseudo-denormal value.
7216 * @param pr80Result Where to return the result.
7217 * @param fFcw FPU control word.
7218 * @param fFsw FPU status word.
7219 */
7220static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7221{
7222 /* As mentioned above, we can skip the expensive polynomial calculation
7223 as it will be close enough to 1.0 that it makes no difference.
7224
7225 The cutoff point for intel 10980XE is exponents >= -69. Intel
7226 also seems to be using a 67-bit or 68-bit constant value, and we get
7227 a smattering of rounding differences if we go for higher precision. */
7228 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7229 {
7230 RTUINT256U u256;
7231 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7232 u256.QWords.qw0 |= 1; /* force #PE */
7233 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7234 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7235 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7236 : 1 - RTFLOAT80U_EXP_BIAS,
7237 fFcw, fFsw);
7238 }
7239 else
7240 {
7241#ifdef IEM_WITH_FLOAT128_FOR_FPU
7242 /* This approach is not good enough for small values, we end up with zero. */
7243 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7244 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7245 _Float128 rd128Result = powf128(2.0L, rd128Val);
7246 rd128Result -= 1.0L;
7247 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7248 iemFpuF128RestoreRounding(fOldRounding);
7249
7250# else
7251 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7252 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7253
7254 /* As mentioned above, enforce 68-bit internal mantissa width to better
7255 match the Intel 10980XE results. */
7256 unsigned const cPrecision = 68;
7257
7258 /* first calculate z = x * ln2 */
7259 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7260 cPrecision);
7261
7262 /* Then do the polynomial evaluation. */
7263 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7264 cPrecision, &SoftState);
7265 r = f128_mul(z, r, &SoftState);
7266
7267 /* Output the result. */
7268 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7269# endif
7270 }
7271 return fFsw;
7272}
7273
7274
7275IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7276{
7277 uint16_t const fFcw = pFpuState->FCW;
7278 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7279
7280 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7281 {
7282 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7283 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7284 else
7285 {
7286 /* Special case:
7287 2^+1.0 - 1.0 = 1.0
7288 2^-1.0 - 1.0 = -0.5 */
7289 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7290 && pr80Val->s.uMantissa == RT_BIT_64(63))
7291 {
7292 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7293 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7294 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7295 }
7296 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7297 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7298 else
7299 pFpuRes->r80Result = *pr80Val;
7300 fFsw |= X86_FSW_PE;
7301 if (!(fFcw & X86_FCW_PM))
7302 fFsw |= X86_FSW_ES | X86_FSW_B;
7303 }
7304 }
7305 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7306 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7307 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7308 pFpuRes->r80Result = *pr80Val;
7309 else if (RTFLOAT80U_IS_INF(pr80Val))
7310 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7311 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7312 {
7313 fFsw |= X86_FSW_DE;
7314 if (fFcw & X86_FCW_DM)
7315 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7316 else
7317 {
7318 pFpuRes->r80Result = *pr80Val;
7319 fFsw |= X86_FSW_ES | X86_FSW_B;
7320 }
7321 }
7322 else
7323 {
7324 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7325 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7326 && (fFcw & X86_FCW_IM))
7327 pFpuRes->r80Result = g_r80Indefinite;
7328 else
7329 {
7330 pFpuRes->r80Result = *pr80Val;
7331 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7332 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7333 }
7334 fFsw |= X86_FSW_IE;
7335 if (!(fFcw & X86_FCW_IM))
7336 fFsw |= X86_FSW_ES | X86_FSW_B;
7337 }
7338 pFpuRes->FSW = fFsw;
7339}
7340
7341#endif /* IEM_WITHOUT_ASSEMBLY */
7342
7343IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7344{
7345 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7346}
7347
7348IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7349{
7350 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7351}
7352
7353#ifdef IEM_WITHOUT_ASSEMBLY
7354
7355IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7356{
7357 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7358 pFpuRes->r80Result = *pr80Val;
7359 pFpuRes->r80Result.s.fSign = 0;
7360}
7361
7362
7363IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7364{
7365 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7366 pFpuRes->r80Result = *pr80Val;
7367 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7368}
7369
7370
7371IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7372{
7373 uint16_t const fFcw = pFpuState->FCW;
7374 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7375
7376 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7377 {
7378 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7379 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7380
7381 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7382 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7383 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7384 }
7385 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7386 {
7387 fFsw |= X86_FSW_ZE;
7388 if (fFcw & X86_FCW_ZM)
7389 {
7390 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7391 pFpuResTwo->r80Result2 = *pr80Val;
7392 }
7393 else
7394 {
7395 pFpuResTwo->r80Result2 = *pr80Val;
7396 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7397 }
7398 }
7399 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7400 {
7401 fFsw |= X86_FSW_DE;
7402 if (fFcw & X86_FCW_DM)
7403 {
7404 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7405 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7406 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7407 int32_t iExponent = -16382;
7408 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7409 {
7410 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7411 iExponent--;
7412 }
7413
7414 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7415 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7416 }
7417 else
7418 {
7419 pFpuResTwo->r80Result2 = *pr80Val;
7420 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7421 }
7422 }
7423 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7424 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7425 {
7426 pFpuResTwo->r80Result1 = *pr80Val;
7427 pFpuResTwo->r80Result2 = *pr80Val;
7428 }
7429 else if (RTFLOAT80U_IS_INF(pr80Val))
7430 {
7431 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7432 pFpuResTwo->r80Result2 = *pr80Val;
7433 }
7434 else
7435 {
7436 if (fFcw & X86_FCW_IM)
7437 {
7438 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7439 pFpuResTwo->r80Result1 = g_r80Indefinite;
7440 else
7441 {
7442 pFpuResTwo->r80Result1 = *pr80Val;
7443 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7444 }
7445 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7446 }
7447 else
7448 {
7449 pFpuResTwo->r80Result2 = *pr80Val;
7450 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7451 }
7452 fFsw |= X86_FSW_IE;
7453 }
7454 pFpuResTwo->FSW = fFsw;
7455}
7456#endif /* IEM_WITHOUT_ASSEMBLY */
7457
7458#if defined(IEM_WITHOUT_ASSEMBLY)
7459
7460static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7461{
7462 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7463 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7464 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7465 extFloat80_t v;
7466 (void)fFcw;
7467
7468 v = extF80_ylog2x(y, x, &SoftState);
7469 iemFpuSoftF80ToIprt(pr80Result, v);
7470
7471 return fFsw;
7472}
7473
7474IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7475 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7476{
7477 uint16_t const fFcw = pFpuState->FCW;
7478 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7479
7480 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7481 {
7482 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7483
7484 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7485 if (!(fFcw & X86_FCW_PM))
7486 fFsw |= X86_FSW_ES | X86_FSW_B;
7487 }
7488 else
7489 {
7490 fFsw |= X86_FSW_IE;
7491
7492 if (!(fFcw & X86_FCW_IM))
7493 {
7494 pFpuRes->r80Result = *pr80Val2;
7495 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7496 }
7497 else
7498 {
7499 pFpuRes->r80Result = g_r80Indefinite;
7500 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7501 }
7502 }
7503
7504 pFpuRes->FSW = fFsw;
7505}
7506#endif /* IEM_WITHOUT_ASSEMBLY */
7507
7508IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7509 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7510{
7511 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7512}
7513
7514IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7515 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7516{
7517 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7518}
7519
7520#if defined(IEM_WITHOUT_ASSEMBLY)
7521
7522static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7523{
7524 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7525 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7526 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7527 extFloat80_t v;
7528 (void)fFcw;
7529
7530 v = extF80_ylog2xp1(y, x, &SoftState);
7531 iemFpuSoftF80ToIprt(pr80Result, v);
7532
7533 return fFsw;
7534}
7535
7536IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7537 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7538{
7539 uint16_t const fFcw = pFpuState->FCW;
7540 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7541
7542 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7543 {
7544 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7545
7546 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7547 if (!(fFcw & X86_FCW_PM))
7548 fFsw |= X86_FSW_ES | X86_FSW_B;
7549 }
7550 else
7551 {
7552 fFsw |= X86_FSW_IE;
7553
7554 if (!(fFcw & X86_FCW_IM))
7555 {
7556 pFpuRes->r80Result = *pr80Val2;
7557 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7558 }
7559 else
7560 {
7561 pFpuRes->r80Result = g_r80Indefinite;
7562 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7563 }
7564 }
7565
7566 pFpuRes->FSW = fFsw;
7567}
7568
7569#endif /* IEM_WITHOUT_ASSEMBLY */
7570
7571IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7572 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7573{
7574 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7575}
7576
7577IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7578 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7579{
7580 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7581}
7582
7583
7584/*********************************************************************************************************************************
7585* MMX, SSE & AVX *
7586*********************************************************************************************************************************/
7587
7588#ifdef IEM_WITH_VEX
7589
7590/*
7591 * VMOVSLDUP
7592 */
7593IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7594{
7595 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7596 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7597 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7598 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7599 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7600 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7601 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7602 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7603}
7604
7605
7606IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7607{
7608 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7609 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7610 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7611 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7612 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7613 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7614 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7615 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7616}
7617
7618#endif /* IEM_WITH_VEX */
7619
7620
7621#ifdef IEM_WITH_VEX
7622
7623/*
7624 * VMOVSHDUP
7625 */
7626IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7627{
7628 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7629 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7630 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7631 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7632 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7633 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7634 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7635 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7636}
7637
7638
7639IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7640{
7641 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7642 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7643 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7644 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7645 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7646 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7647 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7648 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7649}
7650
7651#endif /* IEM_WITH_VEX */
7652
7653
7654#ifdef IEM_WITH_VEX
7655
7656/*
7657 * VMOVDDUP
7658 */
7659IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7660{
7661 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7662 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7663 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7664 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7665}
7666
7667IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7668{
7669 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7670 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7671 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7672 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7673}
7674
7675#endif /* IEM_WITH_VEX */
7676
7677
7678/*
7679 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7680 */
7681#ifdef IEM_WITHOUT_ASSEMBLY
7682
7683IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7684{
7685 RT_NOREF(pFpuState);
7686 *puDst &= *puSrc;
7687}
7688
7689
7690IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7691{
7692 RT_NOREF(pFpuState);
7693 puDst->au64[0] &= puSrc->au64[0];
7694 puDst->au64[1] &= puSrc->au64[1];
7695}
7696
7697#endif
7698
7699IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7700 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7701{
7702 RT_NOREF(pExtState);
7703 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7704 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7705}
7706
7707
7708IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7709 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7710{
7711 RT_NOREF(pExtState);
7712 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7713 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7714 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7715 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7716}
7717
7718
7719/*
7720 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7721 */
7722#ifdef IEM_WITHOUT_ASSEMBLY
7723
7724IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7725{
7726 RT_NOREF(pFpuState);
7727 *puDst = ~*puDst & *puSrc;
7728}
7729
7730
7731IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7732{
7733 RT_NOREF(pFpuState);
7734 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7735 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7736}
7737
7738#endif
7739
7740IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7741 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7742{
7743 RT_NOREF(pExtState);
7744 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7745 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7746}
7747
7748
7749IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7750 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7751{
7752 RT_NOREF(pExtState);
7753 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7754 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7755 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7756 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7757}
7758
7759
7760/*
7761 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7762 */
7763#ifdef IEM_WITHOUT_ASSEMBLY
7764
7765IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7766{
7767 RT_NOREF(pFpuState);
7768 *puDst |= *puSrc;
7769}
7770
7771
7772IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7773{
7774 RT_NOREF(pFpuState);
7775 puDst->au64[0] |= puSrc->au64[0];
7776 puDst->au64[1] |= puSrc->au64[1];
7777}
7778
7779#endif
7780
7781IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7782 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7783{
7784 RT_NOREF(pExtState);
7785 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7786 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7787}
7788
7789
7790IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7791 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7792{
7793 RT_NOREF(pExtState);
7794 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7795 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7796 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7797 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7798}
7799
7800
7801/*
7802 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7803 */
7804#ifdef IEM_WITHOUT_ASSEMBLY
7805
7806IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7807{
7808 RT_NOREF(pFpuState);
7809 *puDst ^= *puSrc;
7810}
7811
7812
7813IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7814{
7815 RT_NOREF(pFpuState);
7816 puDst->au64[0] ^= puSrc->au64[0];
7817 puDst->au64[1] ^= puSrc->au64[1];
7818}
7819
7820#endif
7821
7822IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7823 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7824{
7825 RT_NOREF(pExtState);
7826 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7827 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7828}
7829
7830
7831IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7832 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7833{
7834 RT_NOREF(pExtState);
7835 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7836 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7837 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7838 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7839}
7840
7841
7842/*
7843 * PCMPEQB / VPCMPEQB
7844 */
7845#ifdef IEM_WITHOUT_ASSEMBLY
7846
7847IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7848{
7849 RT_NOREF(pFpuState);
7850 RTUINT64U uSrc1 = { *puDst };
7851 RTUINT64U uSrc2 = { *puSrc };
7852 RTUINT64U uDst;
7853 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7854 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7855 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7856 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7857 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7858 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7859 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7860 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7861 *puDst = uDst.u;
7862}
7863
7864
7865IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7866{
7867 RT_NOREF(pFpuState);
7868 RTUINT128U uSrc1 = *puDst;
7869 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7870 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7871 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7872 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7873 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7874 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7875 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7876 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7877 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7878 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7879 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7880 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7881 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7882 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7883 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7884 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7885}
7886
7887#endif
7888
7889IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7890 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7891{
7892 RT_NOREF(pExtState);
7893 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7894 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7895 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7896 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7897 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7898 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7899 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7900 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7901 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7902 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7903 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7904 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7905 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7906 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7907 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7908 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7909}
7910
7911IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7912 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7913{
7914 RT_NOREF(pExtState);
7915 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7916 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7917 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7918 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7919 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7920 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7921 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7922 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7923 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7924 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7925 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7926 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7927 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7928 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7929 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7930 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7931 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7932 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7933 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7934 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7935 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7936 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7937 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7938 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7939 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7940 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7941 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7942 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7943 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7944 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7945 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7946 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7947}
7948
7949
7950/*
7951 * PCMPEQW / VPCMPEQW
7952 */
7953#ifdef IEM_WITHOUT_ASSEMBLY
7954
7955IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7956{
7957 RT_NOREF(pFpuState);
7958 RTUINT64U uSrc1 = { *puDst };
7959 RTUINT64U uSrc2 = { *puSrc };
7960 RTUINT64U uDst;
7961 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7962 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7963 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7964 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7965 *puDst = uDst.u;
7966}
7967
7968
7969IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7970{
7971 RT_NOREF(pFpuState);
7972 RTUINT128U uSrc1 = *puDst;
7973 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7974 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7975 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7976 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7977 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7978 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7979 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7980 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7981}
7982
7983#endif
7984
7985IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7986 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7987{
7988 RT_NOREF(pExtState);
7989 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7990 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7991 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7992 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7993 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7994 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7995 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7996 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7997}
7998
7999IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8000 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8001{
8002 RT_NOREF(pExtState);
8003 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8004 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8005 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8006 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8007 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8008 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8009 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8010 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8011 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8012 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8013 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8014 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8015 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8016 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8017 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8018 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8019}
8020
8021
8022/*
8023 * PCMPEQD / VPCMPEQD.
8024 */
8025#ifdef IEM_WITHOUT_ASSEMBLY
8026
8027IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8028{
8029 RT_NOREF(pFpuState);
8030 RTUINT64U uSrc1 = { *puDst };
8031 RTUINT64U uSrc2 = { *puSrc };
8032 RTUINT64U uDst;
8033 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8034 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8035 *puDst = uDst.u;
8036}
8037
8038
8039IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8040{
8041 RT_NOREF(pFpuState);
8042 RTUINT128U uSrc1 = *puDst;
8043 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8044 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8045 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8046 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8047}
8048
8049#endif /* IEM_WITHOUT_ASSEMBLY */
8050
8051IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8052 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8053{
8054 RT_NOREF(pExtState);
8055 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8056 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8057 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8058 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8059}
8060
8061IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8062 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8063{
8064 RT_NOREF(pExtState);
8065 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8066 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8067 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8068 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8069 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8070 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8071 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8072 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8073}
8074
8075
8076/*
8077 * PCMPEQQ / VPCMPEQQ.
8078 */
8079IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8080{
8081 RT_NOREF(pFpuState);
8082 RTUINT128U uSrc1 = *puDst;
8083 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8084 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8085}
8086
8087IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8088 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8089{
8090 RT_NOREF(pExtState);
8091 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8092 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8093}
8094
8095IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8096 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8097{
8098 RT_NOREF(pExtState);
8099 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8100 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8101 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8102 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8103}
8104
8105
8106/*
8107 * PCMPGTB / VPCMPGTB
8108 */
8109#ifdef IEM_WITHOUT_ASSEMBLY
8110
8111IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8112{
8113 RT_NOREF(pFpuState);
8114 RTUINT64U uSrc1 = { *puDst };
8115 RTUINT64U uSrc2 = { *puSrc };
8116 RTUINT64U uDst;
8117 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8118 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8119 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8120 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8121 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8122 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8123 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8124 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8125 *puDst = uDst.u;
8126}
8127
8128
8129IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8130{
8131 RT_NOREF(pFpuState);
8132 RTUINT128U uSrc1 = *puDst;
8133 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8134 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8135 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8136 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8137 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8138 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8139 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8140 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8141 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8142 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8143 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8144 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8145 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8146 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8147 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8148 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8149}
8150
8151#endif
8152
8153IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8154 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8155{
8156 RT_NOREF(pExtState);
8157 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8158 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8159 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8160 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8161 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8162 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8163 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8164 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8165 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8166 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8167 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8168 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8169 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8170 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8171 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8172 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8173}
8174
8175IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8176 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8177{
8178 RT_NOREF(pExtState);
8179 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8180 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8181 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8182 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8183 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8184 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8185 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8186 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8187 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8188 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8189 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8190 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8191 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8192 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8193 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8194 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8195 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8196 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8197 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8198 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8199 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8200 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8201 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8202 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8203 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8204 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8205 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8206 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8207 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8208 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8209 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8210 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8211}
8212
8213
8214/*
8215 * PCMPGTW / VPCMPGTW
8216 */
8217#ifdef IEM_WITHOUT_ASSEMBLY
8218
8219IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8220{
8221 RT_NOREF(pFpuState);
8222 RTUINT64U uSrc1 = { *puDst };
8223 RTUINT64U uSrc2 = { *puSrc };
8224 RTUINT64U uDst;
8225 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8226 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8227 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8228 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8229 *puDst = uDst.u;
8230}
8231
8232
8233IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8234{
8235 RT_NOREF(pFpuState);
8236 RTUINT128U uSrc1 = *puDst;
8237 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8238 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8239 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8240 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8241 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8242 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8243 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8244 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8245}
8246
8247#endif
8248
8249IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8250 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8251{
8252 RT_NOREF(pExtState);
8253 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8254 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8255 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8256 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8257 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8258 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8259 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8260 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8261}
8262
8263IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8264 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8265{
8266 RT_NOREF(pExtState);
8267 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8268 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8269 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8270 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8271 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8272 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8273 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8274 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8275 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8276 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8277 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8278 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8279 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8280 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8281 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8282 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8283}
8284
8285
8286/*
8287 * PCMPGTD / VPCMPGTD.
8288 */
8289#ifdef IEM_WITHOUT_ASSEMBLY
8290
8291IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8292{
8293 RT_NOREF(pFpuState);
8294 RTUINT64U uSrc1 = { *puDst };
8295 RTUINT64U uSrc2 = { *puSrc };
8296 RTUINT64U uDst;
8297 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8298 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8299 *puDst = uDst.u;
8300}
8301
8302
8303IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8304{
8305 RT_NOREF(pFpuState);
8306 RTUINT128U uSrc1 = *puDst;
8307 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8308 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8309 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8310 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8311}
8312
8313#endif /* IEM_WITHOUT_ASSEMBLY */
8314
8315IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8316 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8317{
8318 RT_NOREF(pExtState);
8319 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8320 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8321 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8322 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8323}
8324
8325IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8326 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8327{
8328 RT_NOREF(pExtState);
8329 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8330 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8331 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8332 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8333 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8334 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8335 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8336 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8337}
8338
8339
8340/*
8341 * PCMPGTQ / VPCMPGTQ.
8342 */
8343IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8344{
8345 RT_NOREF(pFpuState);
8346 RTUINT128U uSrc1 = *puDst;
8347 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8348 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8349}
8350
8351IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8352 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8353{
8354 RT_NOREF(pExtState);
8355 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8356 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8357}
8358
8359IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8360 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8361{
8362 RT_NOREF(pExtState);
8363 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8364 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8365 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8366 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8367}
8368
8369
8370/*
8371 * PADDB / VPADDB
8372 */
8373#ifdef IEM_WITHOUT_ASSEMBLY
8374
8375IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8376{
8377 RT_NOREF(pFpuState);
8378 RTUINT64U uSrc1 = { *puDst };
8379 RTUINT64U uSrc2 = { *puSrc };
8380 RTUINT64U uDst;
8381 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8382 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8383 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8384 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8385 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8386 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8387 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8388 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8389 *puDst = uDst.u;
8390}
8391
8392
8393IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8394{
8395 RT_NOREF(pFpuState);
8396 RTUINT128U uSrc1 = *puDst;
8397 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8398 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8399 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8400 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8401 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8402 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8403 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8404 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8405 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8406 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8407 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8408 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8409 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8410 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8411 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8412 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8413}
8414
8415#endif
8416
8417
8418IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8419 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8420{
8421 RT_NOREF(pExtState);
8422 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8423 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8424 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8425 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8426 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8427 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8428 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8429 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8430 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8431 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8432 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8433 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8434 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8435 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8436 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8437 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8438}
8439
8440IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8441 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8442{
8443 RT_NOREF(pExtState);
8444 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8445 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8446 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8447 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8448 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8449 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8450 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8451 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8452 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8453 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8454 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8455 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8456 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8457 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8458 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8459 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8460 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8461 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8462 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8463 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8464 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8465 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8466 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8467 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8468 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8469 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8470 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8471 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8472 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8473 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8474 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8475 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8476}
8477
8478
8479/*
8480 * PADDSB / VPADDSB
8481 */
8482#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
8483 ( (uint16_t)((a_iWord) + 0x80) <= (uint16_t)0xff \
8484 ? (uint8_t)(a_iWord) \
8485 : (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) ) /* 0x7f = INT8_MAX; 0x80 = INT8_MIN; source bit 15 = sign */
8486
8487#ifdef IEM_WITHOUT_ASSEMBLY
8488
8489IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8490{
8491 RT_NOREF(pFpuState);
8492 RTUINT64U uSrc1 = { *puDst };
8493 RTUINT64U uSrc2 = { *puSrc };
8494 RTUINT64U uDst;
8495 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8496 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8497 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8498 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8499 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8500 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8501 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8502 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8503 *puDst = uDst.u;
8504}
8505
8506
8507IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8508{
8509 RT_NOREF(pFpuState);
8510 RTUINT128U uSrc1 = *puDst;
8511 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8512 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8513 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8514 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8515 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8516 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8517 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8518 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8519 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8520 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8521 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8522 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8523 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8524 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8525 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8526 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8527}
8528
8529#endif
8530
8531IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u128_fallback,(PRTUINT128U puDst,
8532 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8533{
8534 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8535 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8536 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8537 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8538 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8539 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8540 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8541 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8542 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8543 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8544 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8545 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8546 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8547 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8548 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8549 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8550}
8551
8552IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u256_fallback,(PRTUINT256U puDst,
8553 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8554{
8555 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8556 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8557 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8558 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8559 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8560 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8561 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8562 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8563 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8564 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8565 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8566 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8567 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8568 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8569 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8570 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8571 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] + puSrc2->ai8[16]);
8572 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] + puSrc2->ai8[17]);
8573 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] + puSrc2->ai8[18]);
8574 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] + puSrc2->ai8[19]);
8575 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] + puSrc2->ai8[20]);
8576 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] + puSrc2->ai8[21]);
8577 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] + puSrc2->ai8[22]);
8578 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] + puSrc2->ai8[23]);
8579 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] + puSrc2->ai8[24]);
8580 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] + puSrc2->ai8[25]);
8581 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] + puSrc2->ai8[26]);
8582 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] + puSrc2->ai8[27]);
8583 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] + puSrc2->ai8[28]);
8584 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] + puSrc2->ai8[29]);
8585 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] + puSrc2->ai8[30]);
8586 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] + puSrc2->ai8[31]);
8587}
8588
8589
8590/*
8591 * PADDUSB / VPADDUSB
8592 */
8593#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
8594 ( (uint16_t)(a_uWord) <= (uint16_t)0xff \
8595 ? (uint8_t)(a_uWord) \
8596 : (uint8_t)0xff ) /* 0xff = UINT8_MAX */
8597
8598#ifdef IEM_WITHOUT_ASSEMBLY
8599
8600IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8601{
8602 RT_NOREF(pFpuState);
8603 RTUINT64U uSrc1 = { *puDst };
8604 RTUINT64U uSrc2 = { *puSrc };
8605 RTUINT64U uDst;
8606 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8607 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8608 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8609 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8610 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8611 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8612 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8613 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8614 *puDst = uDst.u;
8615}
8616
8617
8618IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8619{
8620 RT_NOREF(pFpuState);
8621 RTUINT128U uSrc1 = *puDst;
8622 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8623 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8624 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8625 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8626 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8627 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8628 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8629 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8630 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8631 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8632 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8633 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8634 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8635 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8636 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8637 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8638}
8639
8640#endif
8641
8642IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u128_fallback,(PRTUINT128U puDst,
8643 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8644{
8645 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8646 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8647 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8648 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8649 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8650 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8651 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8652 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8653 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8654 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8655 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8656 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8657 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8658 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8659 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8660 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8661}
8662
8663IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u256_fallback,(PRTUINT256U puDst,
8664 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8665{
8666 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8667 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8668 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8669 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8670 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8671 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8672 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8673 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8674 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8675 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8676 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8677 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8678 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8679 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8680 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8681 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8682 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[16] + puSrc2->au8[16]);
8683 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[17] + puSrc2->au8[17]);
8684 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[18] + puSrc2->au8[18]);
8685 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[19] + puSrc2->au8[19]);
8686 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[20] + puSrc2->au8[20]);
8687 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[21] + puSrc2->au8[21]);
8688 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[22] + puSrc2->au8[22]);
8689 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[23] + puSrc2->au8[23]);
8690 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[24] + puSrc2->au8[24]);
8691 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[25] + puSrc2->au8[25]);
8692 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[26] + puSrc2->au8[26]);
8693 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[27] + puSrc2->au8[27]);
8694 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[28] + puSrc2->au8[28]);
8695 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[29] + puSrc2->au8[29]);
8696 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[30] + puSrc2->au8[30]);
8697 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[31] + puSrc2->au8[31]);
8698}
8699
8700
8701/*
8702 * PADDW / VPADDW
8703 */
8704#ifdef IEM_WITHOUT_ASSEMBLY
8705
8706IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8707{
8708 RT_NOREF(pFpuState);
8709 RTUINT64U uSrc1 = { *puDst };
8710 RTUINT64U uSrc2 = { *puSrc };
8711 RTUINT64U uDst;
8712 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8713 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8714 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8715 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8716 *puDst = uDst.u;
8717}
8718
8719
8720IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8721{
8722 RT_NOREF(pFpuState);
8723 RTUINT128U uSrc1 = *puDst;
8724 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8725 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8726 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8727 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8728 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8729 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8730 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8731 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8732}
8733
8734#endif
8735
8736
8737IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8738 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8739{
8740 RT_NOREF(pExtState);
8741 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8742 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8743 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8744 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8745 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8746 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8747 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8748 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8749}
8750
8751IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8752 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8753{
8754 RT_NOREF(pExtState);
8755 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8756 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8757 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8758 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8759 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8760 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8761 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8762 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8763 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8764 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8765 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8766 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8767 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8768 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8769 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8770 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8771}
8772
8773
8774/*
8775 * PADDSW / VPADDSW
8776 */
8777#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
8778 ( (uint32_t)((a_iDword) + 0x8000) <= (uint16_t)0xffff \
8779 ? (uint16_t)(a_iDword) \
8780 : (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) ) /* 0x7fff = INT16_MAX; 0x8000 = INT16_MIN; source bit 31 = sign */
8781
8782#ifdef IEM_WITHOUT_ASSEMBLY
8783
8784IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8785{
8786 RT_NOREF(pFpuState);
8787 RTUINT64U uSrc1 = { *puDst };
8788 RTUINT64U uSrc2 = { *puSrc };
8789 RTUINT64U uDst;
8790 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8791 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8792 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8793 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8794 *puDst = uDst.u;
8795}
8796
8797
8798IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8799{
8800 RT_NOREF(pFpuState);
8801 RTUINT128U uSrc1 = *puDst;
8802 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8803 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8804 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8805 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8806 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8807 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8808 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8809 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8810}
8811
8812#endif
8813
8814IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u128_fallback,(PRTUINT128U puDst,
8815 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8816{
8817 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8818 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8819 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8820 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8821 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8822 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8823 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8824 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8825}
8826
8827IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u256_fallback,(PRTUINT256U puDst,
8828 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8829{
8830 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8831 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8832 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8833 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8834 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8835 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8836 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8837 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8838 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] + puSrc2->ai16[8]);
8839 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] + puSrc2->ai16[9]);
8840 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc2->ai16[10]);
8841 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] + puSrc2->ai16[11]);
8842 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc2->ai16[12]);
8843 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] + puSrc2->ai16[13]);
8844 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc2->ai16[14]);
8845 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] + puSrc2->ai16[15]);
8846}
8847
8848
8849/*
8850 * PADDUSW / VPADDUSW
8851 */
8852#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
8853 ( (uint32_t)(a_uDword) <= (uint16_t)0xffff \
8854 ? (uint16_t)(a_uDword) \
8855 : (uint16_t)0xffff ) /* 0xffff = UINT16_MAX */
8856
8857#ifdef IEM_WITHOUT_ASSEMBLY
8858
8859IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8860{
8861 RT_NOREF(pFpuState);
8862 RTUINT64U uSrc1 = { *puDst };
8863 RTUINT64U uSrc2 = { *puSrc };
8864 RTUINT64U uDst;
8865 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8866 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8867 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8868 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8869 *puDst = uDst.u;
8870}
8871
8872
8873IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8874{
8875 RT_NOREF(pFpuState);
8876 RTUINT128U uSrc1 = *puDst;
8877 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8878 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8879 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8880 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8881 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8882 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8883 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8884 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8885}
8886
8887#endif
8888
8889IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u128_fallback,(PRTUINT128U puDst,
8890 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8891{
8892 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
8893 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
8894 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
8895 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
8896 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
8897 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
8898 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
8899 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
8900}
8901
8902IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u256_fallback,(PRTUINT256U puDst,
8903 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8904{
8905 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
8906 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
8907 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
8908 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
8909 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
8910 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
8911 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
8912 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
8913 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[8] + puSrc2->au16[8]);
8914 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[9] + puSrc2->au16[9]);
8915 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[10] + puSrc2->au16[10]);
8916 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[11] + puSrc2->au16[11]);
8917 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[12] + puSrc2->au16[12]);
8918 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[13] + puSrc2->au16[13]);
8919 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[14] + puSrc2->au16[14]);
8920 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[15] + puSrc2->au16[15]);
8921}
8922
8923
8924/*
8925 * PADDD / VPADDD.
8926 */
8927#ifdef IEM_WITHOUT_ASSEMBLY
8928
8929IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8930{
8931 RT_NOREF(pFpuState);
8932 RTUINT64U uSrc1 = { *puDst };
8933 RTUINT64U uSrc2 = { *puSrc };
8934 RTUINT64U uDst;
8935 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8936 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8937 *puDst = uDst.u;
8938}
8939
8940
8941IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8942{
8943 RT_NOREF(pFpuState);
8944 RTUINT128U uSrc1 = *puDst;
8945 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8946 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8947 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8948 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8949}
8950
8951#endif /* IEM_WITHOUT_ASSEMBLY */
8952
8953IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8954 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8955{
8956 RT_NOREF(pExtState);
8957 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8958 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8959 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8960 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8961}
8962
8963IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8964 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8965{
8966 RT_NOREF(pExtState);
8967 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8968 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8969 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8970 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8971 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8972 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8973 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8974 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8975}
8976
8977
8978/*
8979 * PADDQ / VPADDQ.
8980 */
8981#ifdef IEM_WITHOUT_ASSEMBLY
8982
8983IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8984{
8985 RT_NOREF(pFpuState);
8986 *puDst = *puDst + *puSrc;
8987}
8988
8989IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8990{
8991 RT_NOREF(pFpuState);
8992 RTUINT128U uSrc1 = *puDst;
8993 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8994 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8995}
8996
8997#endif
8998
8999IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9000 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9001{
9002 RT_NOREF(pExtState);
9003 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9004 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9005}
9006
9007IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9008 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9009{
9010 RT_NOREF(pExtState);
9011 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9012 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9013 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
9014 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
9015}
9016
9017
9018/*
9019 * PSUBB / VPSUBB
9020 */
9021#ifdef IEM_WITHOUT_ASSEMBLY
9022
9023IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9024{
9025 RT_NOREF(pFpuState);
9026 RTUINT64U uSrc1 = { *puDst };
9027 RTUINT64U uSrc2 = { *puSrc };
9028 RTUINT64U uDst;
9029 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
9030 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
9031 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
9032 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
9033 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
9034 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
9035 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
9036 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
9037 *puDst = uDst.u;
9038}
9039
9040
9041IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9042{
9043 RT_NOREF(pFpuState);
9044 RTUINT128U uSrc1 = *puDst;
9045 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
9046 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
9047 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
9048 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
9049 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
9050 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
9051 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
9052 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
9053 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
9054 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
9055 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
9056 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
9057 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
9058 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
9059 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
9060 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
9061}
9062
9063#endif
9064
9065IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9066 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9067{
9068 RT_NOREF(pExtState);
9069 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9070 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9071 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9072 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9073 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9074 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9075 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9076 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9077 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9078 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9079 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9080 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9081 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9082 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9083 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9084 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9085}
9086
9087IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9088 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9089{
9090 RT_NOREF(pExtState);
9091 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9092 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9093 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9094 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9095 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9096 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9097 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9098 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9099 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9100 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9101 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9102 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9103 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9104 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9105 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9106 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9107 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
9108 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
9109 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
9110 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
9111 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
9112 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
9113 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
9114 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
9115 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
9116 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
9117 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
9118 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
9119 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
9120 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
9121 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
9122 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
9123}
9124
9125
9126/*
9127 * PSUBSB / VSUBSB
9128 */
9129#ifdef IEM_WITHOUT_ASSEMBLY
9130
9131IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9132{
9133 RT_NOREF(pFpuState);
9134 RTUINT64U uSrc1 = { *puDst };
9135 RTUINT64U uSrc2 = { *puSrc };
9136 RTUINT64U uDst;
9137 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
9138 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
9139 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
9140 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
9141 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
9142 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
9143 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
9144 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
9145 *puDst = uDst.u;
9146}
9147
9148
9149IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9150{
9151 RT_NOREF(pFpuState);
9152 RTUINT128U uSrc1 = *puDst;
9153 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
9154 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
9155 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
9156 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
9157 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
9158 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
9159 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
9160 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
9161 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
9162 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
9163 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
9164 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
9165 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
9166 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
9167 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
9168 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
9169}
9170
9171#endif
9172
9173IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u128_fallback,(PRTUINT128U puDst,
9174 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9175{
9176 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9177 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9178 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9179 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9180 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9181 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9182 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9183 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9184 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9185 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9186 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9187 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9188 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9189 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9190 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9191 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9192}
9193
9194IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u256_fallback,(PRTUINT256U puDst,
9195 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9196{
9197 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9198 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9199 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9200 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9201 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9202 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9203 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9204 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9205 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9206 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9207 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9208 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9209 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9210 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9211 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9212 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9213 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] - puSrc2->ai8[16]);
9214 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] - puSrc2->ai8[17]);
9215 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] - puSrc2->ai8[18]);
9216 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] - puSrc2->ai8[19]);
9217 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] - puSrc2->ai8[20]);
9218 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] - puSrc2->ai8[21]);
9219 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] - puSrc2->ai8[22]);
9220 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] - puSrc2->ai8[23]);
9221 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] - puSrc2->ai8[24]);
9222 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] - puSrc2->ai8[25]);
9223 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] - puSrc2->ai8[26]);
9224 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] - puSrc2->ai8[27]);
9225 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] - puSrc2->ai8[28]);
9226 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] - puSrc2->ai8[29]);
9227 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] - puSrc2->ai8[30]);
9228 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] - puSrc2->ai8[31]);
9229}
9230
9231
9232/*
9233 * PSUBUSB / VPSUBUSW
9234 */
9235#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
9236 ( (uint16_t)(a_uWord) <= (uint16_t)0xff \
9237 ? (uint8_t)(a_uWord) \
9238 : (uint8_t)0 )
9239
9240#ifdef IEM_WITHOUT_ASSEMBLY
9241
9242IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9243{
9244 RT_NOREF(pFpuState);
9245 RTUINT64U uSrc1 = { *puDst };
9246 RTUINT64U uSrc2 = { *puSrc };
9247 RTUINT64U uDst;
9248 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9249 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9250 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9251 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9252 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9253 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9254 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9255 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9256 *puDst = uDst.u;
9257}
9258
9259
9260IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9261{
9262 RT_NOREF(pFpuState);
9263 RTUINT128U uSrc1 = *puDst;
9264 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9265 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9266 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9267 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9268 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9269 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9270 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9271 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9272 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9273 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9274 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9275 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9276 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9277 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9278 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9279 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9280}
9281
9282#endif
9283
9284IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u128_fallback,(PRTUINT128U puDst,
9285 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9286{
9287 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9288 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9289 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9290 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9291 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9292 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9293 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9294 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9295 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9296 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9297 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9298 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9299 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9300 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9301 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9302 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9303}
9304
9305IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u256_fallback,(PRTUINT256U puDst,
9306 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9307{
9308 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9309 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9310 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9311 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9312 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9313 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9314 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9315 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9316 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9317 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9318 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9319 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9320 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9321 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9322 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9323 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9324 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[16] - puSrc2->au8[16]);
9325 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[17] - puSrc2->au8[17]);
9326 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[18] - puSrc2->au8[18]);
9327 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[19] - puSrc2->au8[19]);
9328 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[20] - puSrc2->au8[20]);
9329 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[21] - puSrc2->au8[21]);
9330 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[22] - puSrc2->au8[22]);
9331 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[23] - puSrc2->au8[23]);
9332 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[24] - puSrc2->au8[24]);
9333 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[25] - puSrc2->au8[25]);
9334 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[26] - puSrc2->au8[26]);
9335 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[27] - puSrc2->au8[27]);
9336 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[28] - puSrc2->au8[28]);
9337 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[29] - puSrc2->au8[29]);
9338 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[30] - puSrc2->au8[30]);
9339 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[31] - puSrc2->au8[31]);
9340}
9341
9342
9343/*
9344 * PSUBW / VPSUBW
9345 */
9346#ifdef IEM_WITHOUT_ASSEMBLY
9347
9348IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9349{
9350 RT_NOREF(pFpuState);
9351 RTUINT64U uSrc1 = { *puDst };
9352 RTUINT64U uSrc2 = { *puSrc };
9353 RTUINT64U uDst;
9354 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9355 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9356 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9357 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9358 *puDst = uDst.u;
9359}
9360
9361
9362IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9363{
9364 RT_NOREF(pFpuState);
9365 RTUINT128U uSrc1 = *puDst;
9366 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9367 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9368 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9369 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9370 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9371 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9372 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9373 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9374}
9375
9376#endif
9377
9378IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9379 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9380{
9381 RT_NOREF(pExtState);
9382 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9383 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9384 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9385 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9386 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9387 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9388 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9389 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9390}
9391
9392IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9393 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9394{
9395 RT_NOREF(pExtState);
9396 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9397 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9398 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9399 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9400 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9401 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9402 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9403 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9404 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9405 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9406 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9407 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9408 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9409 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9410 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9411 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9412}
9413
9414
9415/*
9416 * PSUBSW / VPSUBSW
9417 */
9418#ifdef IEM_WITHOUT_ASSEMBLY
9419
9420IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9421{
9422 RT_NOREF(pFpuState);
9423 RTUINT64U uSrc1 = { *puDst };
9424 RTUINT64U uSrc2 = { *puSrc };
9425 RTUINT64U uDst;
9426 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9427 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9428 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9429 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9430 *puDst = uDst.u;
9431}
9432
9433
9434IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9435{
9436 RT_NOREF(pFpuState);
9437 RTUINT128U uSrc1 = *puDst;
9438 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9439 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9440 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9441 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9442 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9443 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9444 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9445 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9446}
9447
9448#endif
9449
9450IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u128_fallback,(PRTUINT128U puDst,
9451 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9452{
9453 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9454 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9455 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9456 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9457 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9458 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9459 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9460 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9461}
9462
9463IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u256_fallback,(PRTUINT256U puDst,
9464 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9465{
9466 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9467 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9468 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9469 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9470 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9471 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9472 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9473 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9474 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] - puSrc2->ai16[8]);
9475 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] - puSrc2->ai16[9]);
9476 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc2->ai16[10]);
9477 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] - puSrc2->ai16[11]);
9478 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc2->ai16[12]);
9479 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] - puSrc2->ai16[13]);
9480 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc2->ai16[14]);
9481 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] - puSrc2->ai16[15]);
9482}
9483
9484
9485/*
9486 * PSUBUSW / VPSUBUSW
9487 */
9488#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
9489 ( (uint32_t)(a_uDword) <= (uint16_t)0xffff \
9490 ? (uint16_t)(a_uDword) \
9491 : (uint16_t)0 )
9492
9493#ifdef IEM_WITHOUT_ASSEMBLY
9494
9495IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9496{
9497 RT_NOREF(pFpuState);
9498 RTUINT64U uSrc1 = { *puDst };
9499 RTUINT64U uSrc2 = { *puSrc };
9500 RTUINT64U uDst;
9501 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9502 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9503 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9504 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9505 *puDst = uDst.u;
9506}
9507
9508
9509IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9510{
9511 RT_NOREF(pFpuState);
9512 RTUINT128U uSrc1 = *puDst;
9513 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9514 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9515 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9516 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9517 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9518 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9519 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9520 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9521}
9522
9523#endif
9524
9525IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u128_fallback,(PRTUINT128U puDst,
9526 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9527{
9528 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9529 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9530 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9531 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9532 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9533 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9534 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9535 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9536}
9537
9538IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u256_fallback,(PRTUINT256U puDst,
9539 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9540{
9541 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9542 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9543 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9544 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9545 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9546 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9547 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9548 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9549 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[8] - puSrc2->au16[8]);
9550 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[9] - puSrc2->au16[9]);
9551 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[10] - puSrc2->au16[10]);
9552 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[11] - puSrc2->au16[11]);
9553 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[12] - puSrc2->au16[12]);
9554 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[13] - puSrc2->au16[13]);
9555 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[14] - puSrc2->au16[14]);
9556 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[15] - puSrc2->au16[15]);
9557}
9558
9559
9560
9561/*
9562 * PSUBD / VPSUBD.
9563 */
9564#ifdef IEM_WITHOUT_ASSEMBLY
9565
9566IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9567{
9568 RT_NOREF(pFpuState);
9569 RTUINT64U uSrc1 = { *puDst };
9570 RTUINT64U uSrc2 = { *puSrc };
9571 RTUINT64U uDst;
9572 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9573 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9574 *puDst = uDst.u;
9575}
9576
9577
9578IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9579{
9580 RT_NOREF(pFpuState);
9581 RTUINT128U uSrc1 = *puDst;
9582 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9583 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9584 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9585 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9586}
9587
9588#endif /* IEM_WITHOUT_ASSEMBLY */
9589
9590IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9591 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9592{
9593 RT_NOREF(pExtState);
9594 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9595 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9596 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9597 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9598}
9599
9600IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9601 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9602{
9603 RT_NOREF(pExtState);
9604 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9605 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9606 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9607 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9608 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9609 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9610 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9611 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9612}
9613
9614
9615/*
9616 * PSUBQ / VPSUBQ.
9617 */
9618#ifdef IEM_WITHOUT_ASSEMBLY
9619
9620IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9621{
9622 RT_NOREF(pFpuState);
9623 *puDst = *puDst - *puSrc;
9624}
9625
9626IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9627{
9628 RT_NOREF(pFpuState);
9629 RTUINT128U uSrc1 = *puDst;
9630 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9631 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9632}
9633
9634#endif
9635
9636IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9637 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9638{
9639 RT_NOREF(pExtState);
9640 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9641 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9642}
9643
9644IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9645 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9646{
9647 RT_NOREF(pExtState);
9648 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9649 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9650 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9651 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9652}
9653
9654
9655
9656/*
9657 * PMULLW / VPMULLW / PMULLD / VPMULLD
9658 */
9659#ifdef IEM_WITHOUT_ASSEMBLY
9660
9661IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9662{
9663 RT_NOREF(pFpuState);
9664 RTUINT64U uSrc1 = { *puDst };
9665 RTUINT64U uSrc2 = { *puSrc };
9666 RTUINT64U uDst;
9667 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9668 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9669 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9670 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9671 *puDst = uDst.u;
9672}
9673
9674
9675IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9676{
9677 RT_NOREF(pFpuState);
9678 RTUINT128U uSrc1 = *puDst;
9679 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9680 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9681 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9682 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9683 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9684 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9685 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9686 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9687}
9688
9689#endif
9690
9691IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9692{
9693 RTUINT128U uSrc1 = *puDst;
9694
9695 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9696 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9697 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9698 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9699 RT_NOREF(pFpuState);
9700}
9701
9702
9703IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9704{
9705 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9706 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9707 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9708 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9709 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9710 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9711 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9712 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9713}
9714
9715
9716IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9717{
9718 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9719 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9720 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9721 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9722 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9723 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9724 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9725 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9726 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9727 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9728 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9729 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9730 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9731 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9732 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9733 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9734}
9735
9736
9737IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9738{
9739 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9740 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9741 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9742 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9743}
9744
9745
9746IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9747{
9748 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9749 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9750 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9751 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9752 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9753 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9754 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9755 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9756}
9757
9758
9759/*
9760 * PMULHW / VPMULHW
9761 */
9762#ifdef IEM_WITHOUT_ASSEMBLY
9763
9764IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9765{
9766 RT_NOREF(pFpuState);
9767 RTUINT64U uSrc1 = { *puDst };
9768 RTUINT64U uSrc2 = { *puSrc };
9769 RTUINT64U uDst;
9770 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9771 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9772 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9773 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9774 *puDst = uDst.u;
9775}
9776
9777
9778IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9779{
9780 RT_NOREF(pFpuState);
9781 RTUINT128U uSrc1 = *puDst;
9782 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9783 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9784 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9785 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9786 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9787 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9788 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9789 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9790}
9791
9792#endif
9793
9794IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9795{
9796 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9797 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9798 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9799 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9800 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9801 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9802 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9803 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9804}
9805
9806
9807IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9808{
9809 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9810 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9811 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9812 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9813 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9814 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9815 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9816 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9817 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9818 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9819 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9820 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9821 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9822 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9823 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9824 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9825}
9826
9827
9828/*
9829 * PMULHUW / VPMULHUW
9830 */
9831#ifdef IEM_WITHOUT_ASSEMBLY
9832
9833IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9834{
9835 RTUINT64U uSrc1 = { *puDst };
9836 RTUINT64U uSrc2 = { *puSrc };
9837 RTUINT64U uDst;
9838 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9839 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9840 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9841 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9842 *puDst = uDst.u;
9843}
9844
9845
9846IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9847{
9848 RTUINT128U uSrc1 = *puDst;
9849 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9850 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9851 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9852 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9853 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9854 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9855 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9856 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9857}
9858
9859#endif
9860
9861IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9862{
9863 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9864 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9865 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9866 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9867 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9868 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9869 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9870 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9871}
9872
9873
9874IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9875{
9876 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9877 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9878 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9879 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9880 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9881 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9882 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9883 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9884 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9885 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9886 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9887 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9888 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9889 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9890 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9891 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9892}
9893
9894
9895/*
9896 * PSRLW / VPSRLW
9897 */
9898#ifdef IEM_WITHOUT_ASSEMBLY
9899
9900IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9901{
9902 RTUINT64U uSrc1 = { *puDst };
9903 RTUINT64U uSrc2 = { *puSrc };
9904 RTUINT64U uDst;
9905
9906 if (uSrc2.au64[0] <= 15)
9907 {
9908 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9909 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9910 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9911 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9912 }
9913 else
9914 {
9915 uDst.au64[0] = 0;
9916 }
9917 *puDst = uDst.u;
9918}
9919
9920
9921IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9922{
9923 RTUINT64U uSrc1 = { *puDst };
9924 RTUINT64U uDst;
9925
9926 if (uShift <= 15)
9927 {
9928 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9929 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9930 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9931 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9932 }
9933 else
9934 {
9935 uDst.au64[0] = 0;
9936 }
9937 *puDst = uDst.u;
9938}
9939
9940
9941IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9942{
9943 RTUINT128U uSrc1 = *puDst;
9944
9945 if (puSrc->au64[0] <= 15)
9946 {
9947 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9948 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9949 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9950 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9951 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9952 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9953 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9954 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9955 }
9956 else
9957 {
9958 puDst->au64[0] = 0;
9959 puDst->au64[1] = 0;
9960 }
9961}
9962
9963IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9964{
9965 RTUINT128U uSrc1 = *puDst;
9966
9967 if (uShift <= 15)
9968 {
9969 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9970 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9971 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9972 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9973 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9974 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9975 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9976 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9977 }
9978 else
9979 {
9980 puDst->au64[0] = 0;
9981 puDst->au64[1] = 0;
9982 }
9983}
9984
9985#endif
9986
9987
9988/*
9989 * PSRAW / VPSRAW
9990 */
9991#ifdef IEM_WITHOUT_ASSEMBLY
9992
9993IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9994{
9995 RTUINT64U uSrc1 = { *puDst };
9996 RTUINT64U uSrc2 = { *puSrc };
9997 RTUINT64U uDst;
9998
9999 if (uSrc2.au64[0] <= 15)
10000 {
10001 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
10002 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
10003 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
10004 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
10005 }
10006 else
10007 {
10008 uDst.au64[0] = 0;
10009 }
10010 *puDst = uDst.u;
10011}
10012
10013
10014IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10015{
10016 RTUINT64U uSrc1 = { *puDst };
10017 RTUINT64U uDst;
10018
10019 if (uShift <= 15)
10020 {
10021 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10022 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10023 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10024 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10025 }
10026 else
10027 {
10028 uDst.au64[0] = 0;
10029 }
10030 *puDst = uDst.u;
10031}
10032
10033
10034IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10035{
10036 RTUINT128U uSrc1 = *puDst;
10037
10038 if (puSrc->au64[0] <= 15)
10039 {
10040 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
10041 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
10042 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
10043 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
10044 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
10045 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
10046 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
10047 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
10048 }
10049 else
10050 {
10051 puDst->au64[0] = 0;
10052 puDst->au64[1] = 0;
10053 }
10054}
10055
10056IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10057{
10058 RTUINT128U uSrc1 = *puDst;
10059
10060 if (uShift <= 15)
10061 {
10062 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10063 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10064 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10065 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10066 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10067 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10068 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10069 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10070 }
10071 else
10072 {
10073 puDst->au64[0] = 0;
10074 puDst->au64[1] = 0;
10075 }
10076}
10077
10078#endif
10079
10080
10081/*
10082 * PSLLW / VPSLLW
10083 */
10084#ifdef IEM_WITHOUT_ASSEMBLY
10085
10086IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10087{
10088 RTUINT64U uSrc1 = { *puDst };
10089 RTUINT64U uSrc2 = { *puSrc };
10090 RTUINT64U uDst;
10091
10092 if (uSrc2.au64[0] <= 15)
10093 {
10094 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
10095 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
10096 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
10097 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
10098 }
10099 else
10100 {
10101 uDst.au64[0] = 0;
10102 }
10103 *puDst = uDst.u;
10104}
10105
10106
10107IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10108{
10109 RTUINT64U uSrc1 = { *puDst };
10110 RTUINT64U uDst;
10111
10112 if (uShift <= 15)
10113 {
10114 uDst.au16[0] = uSrc1.au16[0] << uShift;
10115 uDst.au16[1] = uSrc1.au16[1] << uShift;
10116 uDst.au16[2] = uSrc1.au16[2] << uShift;
10117 uDst.au16[3] = uSrc1.au16[3] << uShift;
10118 }
10119 else
10120 {
10121 uDst.au64[0] = 0;
10122 }
10123 *puDst = uDst.u;
10124}
10125
10126
10127IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10128{
10129 RTUINT128U uSrc1 = *puDst;
10130
10131 if (puSrc->au64[0] <= 15)
10132 {
10133 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
10134 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
10135 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
10136 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
10137 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
10138 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
10139 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
10140 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
10141 }
10142 else
10143 {
10144 puDst->au64[0] = 0;
10145 puDst->au64[1] = 0;
10146 }
10147}
10148
10149IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10150{
10151 RTUINT128U uSrc1 = *puDst;
10152
10153 if (uShift <= 15)
10154 {
10155 puDst->au16[0] = uSrc1.au16[0] << uShift;
10156 puDst->au16[1] = uSrc1.au16[1] << uShift;
10157 puDst->au16[2] = uSrc1.au16[2] << uShift;
10158 puDst->au16[3] = uSrc1.au16[3] << uShift;
10159 puDst->au16[4] = uSrc1.au16[4] << uShift;
10160 puDst->au16[5] = uSrc1.au16[5] << uShift;
10161 puDst->au16[6] = uSrc1.au16[6] << uShift;
10162 puDst->au16[7] = uSrc1.au16[7] << uShift;
10163 }
10164 else
10165 {
10166 puDst->au64[0] = 0;
10167 puDst->au64[1] = 0;
10168 }
10169}
10170
10171#endif
10172
10173
10174/*
10175 * PSRLD / VPSRLD
10176 */
10177#ifdef IEM_WITHOUT_ASSEMBLY
10178
10179IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10180{
10181 RTUINT64U uSrc1 = { *puDst };
10182 RTUINT64U uSrc2 = { *puSrc };
10183 RTUINT64U uDst;
10184
10185 if (uSrc2.au64[0] <= 31)
10186 {
10187 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
10188 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
10189 }
10190 else
10191 {
10192 uDst.au64[0] = 0;
10193 }
10194 *puDst = uDst.u;
10195}
10196
10197
10198IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10199{
10200 RTUINT64U uSrc1 = { *puDst };
10201 RTUINT64U uDst;
10202
10203 if (uShift <= 31)
10204 {
10205 uDst.au32[0] = uSrc1.au32[0] >> uShift;
10206 uDst.au32[1] = uSrc1.au32[1] >> uShift;
10207 }
10208 else
10209 {
10210 uDst.au64[0] = 0;
10211 }
10212 *puDst = uDst.u;
10213}
10214
10215
10216IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10217{
10218 RTUINT128U uSrc1 = *puDst;
10219
10220 if (puSrc->au64[0] <= 31)
10221 {
10222 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
10223 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
10224 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
10225 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
10226 }
10227 else
10228 {
10229 puDst->au64[0] = 0;
10230 puDst->au64[1] = 0;
10231 }
10232}
10233
10234IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10235{
10236 RTUINT128U uSrc1 = *puDst;
10237
10238 if (uShift <= 31)
10239 {
10240 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10241 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10242 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10243 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10244 }
10245 else
10246 {
10247 puDst->au64[0] = 0;
10248 puDst->au64[1] = 0;
10249 }
10250}
10251
10252#endif
10253
10254
10255/*
10256 * PSRAD / VPSRAD
10257 */
10258#ifdef IEM_WITHOUT_ASSEMBLY
10259
10260IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
10261{
10262 RTUINT64U uSrc1 = { *puDst };
10263 RTUINT64U uSrc2 = { *puSrc };
10264 RTUINT64U uDst;
10265
10266 if (uSrc2.au64[0] <= 31)
10267 {
10268 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
10269 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
10270 }
10271 else
10272 {
10273 uDst.au64[0] = 0;
10274 }
10275 *puDst = uDst.u;
10276}
10277
10278
10279IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
10280{
10281 RTUINT64U uSrc1 = { *puDst };
10282 RTUINT64U uDst;
10283
10284 if (uShift <= 31)
10285 {
10286 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10287 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10288 }
10289 else
10290 {
10291 uDst.au64[0] = 0;
10292 }
10293 *puDst = uDst.u;
10294}
10295
10296
10297IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10298{
10299 RTUINT128U uSrc1 = *puDst;
10300
10301 if (puSrc->au64[0] <= 31)
10302 {
10303 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
10304 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
10305 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
10306 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
10307 }
10308 else
10309 {
10310 puDst->au64[0] = 0;
10311 puDst->au64[1] = 0;
10312 }
10313}
10314
10315IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10316{
10317 RTUINT128U uSrc1 = *puDst;
10318
10319 if (uShift <= 31)
10320 {
10321 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10322 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10323 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10324 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10325 }
10326 else
10327 {
10328 puDst->au64[0] = 0;
10329 puDst->au64[1] = 0;
10330 }
10331}
10332
10333#endif
10334
10335
10336/*
10337 * PSLLD / VPSLLD
10338 */
10339#ifdef IEM_WITHOUT_ASSEMBLY
10340
10341IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10342{
10343 RTUINT64U uSrc1 = { *puDst };
10344 RTUINT64U uSrc2 = { *puSrc };
10345 RTUINT64U uDst;
10346
10347 if (uSrc2.au64[0] <= 31)
10348 {
10349 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
10350 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
10351 }
10352 else
10353 {
10354 uDst.au64[0] = 0;
10355 }
10356 *puDst = uDst.u;
10357}
10358
10359
10360IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10361{
10362 RTUINT64U uSrc1 = { *puDst };
10363 RTUINT64U uDst;
10364
10365 if (uShift <= 31)
10366 {
10367 uDst.au32[0] = uSrc1.au32[0] << uShift;
10368 uDst.au32[1] = uSrc1.au32[1] << uShift;
10369 }
10370 else
10371 {
10372 uDst.au64[0] = 0;
10373 }
10374 *puDst = uDst.u;
10375}
10376
10377
10378IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10379{
10380 RTUINT128U uSrc1 = *puDst;
10381
10382 if (puSrc->au64[0] <= 31)
10383 {
10384 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10385 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10386 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10387 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10388 }
10389 else
10390 {
10391 puDst->au64[0] = 0;
10392 puDst->au64[1] = 0;
10393 }
10394}
10395
10396IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10397{
10398 RTUINT128U uSrc1 = *puDst;
10399
10400 if (uShift <= 31)
10401 {
10402 puDst->au32[0] = uSrc1.au32[0] << uShift;
10403 puDst->au32[1] = uSrc1.au32[1] << uShift;
10404 puDst->au32[2] = uSrc1.au32[2] << uShift;
10405 puDst->au32[3] = uSrc1.au32[3] << uShift;
10406 }
10407 else
10408 {
10409 puDst->au64[0] = 0;
10410 puDst->au64[1] = 0;
10411 }
10412}
10413
10414#endif
10415
10416
10417/*
10418 * PSRLQ / VPSRLQ
10419 */
10420#ifdef IEM_WITHOUT_ASSEMBLY
10421
10422IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10423{
10424 RTUINT64U uSrc1 = { *puDst };
10425 RTUINT64U uSrc2 = { *puSrc };
10426 RTUINT64U uDst;
10427
10428 if (uSrc2.au64[0] <= 63)
10429 {
10430 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10431 }
10432 else
10433 {
10434 uDst.au64[0] = 0;
10435 }
10436 *puDst = uDst.u;
10437}
10438
10439
10440IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10441{
10442 RTUINT64U uSrc1 = { *puDst };
10443 RTUINT64U uDst;
10444
10445 if (uShift <= 63)
10446 {
10447 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10448 }
10449 else
10450 {
10451 uDst.au64[0] = 0;
10452 }
10453 *puDst = uDst.u;
10454}
10455
10456
10457IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10458{
10459 RTUINT128U uSrc1 = *puDst;
10460
10461 if (puSrc->au64[0] <= 63)
10462 {
10463 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10464 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10465 }
10466 else
10467 {
10468 puDst->au64[0] = 0;
10469 puDst->au64[1] = 0;
10470 }
10471}
10472
10473IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10474{
10475 RTUINT128U uSrc1 = *puDst;
10476
10477 if (uShift <= 63)
10478 {
10479 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10480 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10481 }
10482 else
10483 {
10484 puDst->au64[0] = 0;
10485 puDst->au64[1] = 0;
10486 }
10487}
10488
10489#endif
10490
10491
10492/*
10493 * PSLLQ / VPSLLQ
10494 */
10495#ifdef IEM_WITHOUT_ASSEMBLY
10496
10497IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10498{
10499 RTUINT64U uSrc1 = { *puDst };
10500 RTUINT64U uSrc2 = { *puSrc };
10501 RTUINT64U uDst;
10502
10503 if (uSrc2.au64[0] <= 63)
10504 {
10505 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10506 }
10507 else
10508 {
10509 uDst.au64[0] = 0;
10510 }
10511 *puDst = uDst.u;
10512}
10513
10514
10515IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10516{
10517 RTUINT64U uSrc1 = { *puDst };
10518 RTUINT64U uDst;
10519
10520 if (uShift <= 63)
10521 {
10522 uDst.au64[0] = uSrc1.au64[0] << uShift;
10523 }
10524 else
10525 {
10526 uDst.au64[0] = 0;
10527 }
10528 *puDst = uDst.u;
10529}
10530
10531
10532IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10533{
10534 RTUINT128U uSrc1 = *puDst;
10535
10536 if (puSrc->au64[0] <= 63)
10537 {
10538 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10539 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10540 }
10541 else
10542 {
10543 puDst->au64[0] = 0;
10544 puDst->au64[1] = 0;
10545 }
10546}
10547
10548IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10549{
10550 RTUINT128U uSrc1 = *puDst;
10551
10552 if (uShift <= 63)
10553 {
10554 puDst->au64[0] = uSrc1.au64[0] << uShift;
10555 puDst->au64[1] = uSrc1.au64[1] << uShift;
10556 }
10557 else
10558 {
10559 puDst->au64[0] = 0;
10560 puDst->au64[1] = 0;
10561 }
10562}
10563
10564#endif
10565
10566
10567/*
10568 * PSRLDQ / VPSRLDQ
10569 */
10570#ifdef IEM_WITHOUT_ASSEMBLY
10571
10572IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10573{
10574 RTUINT128U uSrc1 = *puDst;
10575
10576 if (uShift < 16)
10577 {
10578 int i;
10579
10580 for (i = 0; i < 16 - uShift; ++i)
10581 puDst->au8[i] = uSrc1.au8[i + uShift];
10582 for (i = 16 - uShift; i < 16; ++i)
10583 puDst->au8[i] = 0;
10584 }
10585 else
10586 {
10587 puDst->au64[0] = 0;
10588 puDst->au64[1] = 0;
10589 }
10590}
10591
10592#endif
10593
10594
10595/*
10596 * PSLLDQ / VPSLLDQ
10597 */
10598#ifdef IEM_WITHOUT_ASSEMBLY
10599
10600IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10601{
10602 RTUINT128U uSrc1 = *puDst;
10603
10604 if (uShift < 16)
10605 {
10606 int i;
10607
10608 for (i = 0; i < uShift; ++i)
10609 puDst->au8[i] = 0;
10610 for (i = uShift; i < 16; ++i)
10611 puDst->au8[i] = uSrc1.au8[i - uShift];
10612 }
10613 else
10614 {
10615 puDst->au64[0] = 0;
10616 puDst->au64[1] = 0;
10617 }
10618}
10619
10620#endif
10621
10622
10623/*
10624 * PMADDWD / VPMADDWD
10625 */
10626#ifdef IEM_WITHOUT_ASSEMBLY
10627
10628IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10629{
10630 RTUINT64U uSrc1 = { *puDst };
10631 RTUINT64U uSrc2 = { *puSrc };
10632 RTUINT64U uDst;
10633
10634 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
10635 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
10636 *puDst = uDst.u;
10637 RT_NOREF(pFpuState);
10638}
10639
10640
10641IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10642{
10643 RTUINT128U uSrc1 = *puDst;
10644
10645 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
10646 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
10647 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
10648 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
10649 RT_NOREF(pFpuState);
10650}
10651
10652#endif
10653
10654
10655/*
10656 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
10657 */
10658#ifdef IEM_WITHOUT_ASSEMBLY
10659
10660IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10661{
10662 RTUINT64U uSrc1 = { *puDst };
10663 RTUINT64U uSrc2 = { *puSrc };
10664 RTUINT64U uDst;
10665
10666 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
10667 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
10668 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
10669 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
10670 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
10671 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
10672 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
10673 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
10674 *puDst = uDst.u;
10675 RT_NOREF(pFpuState);
10676}
10677
10678
10679IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10680{
10681 RTUINT128U uSrc1 = *puDst;
10682
10683 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
10684 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
10685 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
10686 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
10687 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
10688 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
10689 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
10690 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
10691 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
10692 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
10693 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
10694 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
10695 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
10696 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
10697 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
10698 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
10699 RT_NOREF(pFpuState);
10700}
10701
10702#endif
10703
10704
10705IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10706{
10707 RTUINT128U uSrc1 = *puDst;
10708
10709 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
10710 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
10711 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
10712 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
10713 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
10714 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
10715 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
10716 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
10717 RT_NOREF(pFpuState);
10718}
10719
10720
10721IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10722{
10723 RTUINT128U uSrc1 = *puDst;
10724
10725 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
10726 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
10727 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
10728 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
10729 RT_NOREF(pFpuState);
10730}
10731
10732
10733IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10734 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10735{
10736 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10737 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10738 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10739 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10740 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10741 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10742 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10743 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10744 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10745 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10746 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10747 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10748 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10749 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10750 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10751 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10752 RT_NOREF(pExtState);
10753}
10754
10755
10756IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10757 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10758{
10759 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10760 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10761 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10762 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10763 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10764 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10765 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10766 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10767 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10768 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10769 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10770 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10771 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10772 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10773 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10774 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10775 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
10776 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
10777 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
10778 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
10779 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
10780 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
10781 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
10782 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
10783 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
10784 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
10785 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
10786 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
10787 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
10788 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
10789 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
10790 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
10791 RT_NOREF(pExtState);
10792}
10793
10794
10795IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10796 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10797{
10798 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10799 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10800 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10801 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10802 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10803 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10804 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10805 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10806 RT_NOREF(pExtState);
10807}
10808
10809
10810IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10811 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10812{
10813 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10814 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10815 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10816 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10817 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10818 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10819 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10820 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10821 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10822 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10823 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
10824 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
10825 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
10826 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
10827 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
10828 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
10829 RT_NOREF(pExtState);
10830}
10831
10832
10833IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10834 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10835{
10836 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10837 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10838 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10839 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10840 RT_NOREF(pExtState);
10841}
10842
10843
10844IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10845 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10846{
10847 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10848 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10849 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10850 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10851 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10852 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10853 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10854 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10855 RT_NOREF(pExtState);
10856}
10857
10858
10859/*
10860 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
10861 */
10862#ifdef IEM_WITHOUT_ASSEMBLY
10863
10864IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10865{
10866 RTUINT64U uSrc1 = { *puDst };
10867 RTUINT64U uSrc2 = { *puSrc };
10868 RTUINT64U uDst;
10869
10870 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
10871 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
10872 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
10873 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
10874 *puDst = uDst.u;
10875 RT_NOREF(pFpuState);
10876}
10877
10878
10879IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10880{
10881 RTUINT128U uSrc1 = *puDst;
10882
10883 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10884 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10885 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10886 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10887 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10888 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10889 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10890 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10891 RT_NOREF(pFpuState);
10892}
10893
10894#endif
10895
10896IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10897{
10898 RTUINT128U uSrc1 = *puDst;
10899
10900 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10901 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10902 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10903 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10904 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10905 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10906 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10907 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10908 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10909 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10910 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
10911 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
10912 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
10913 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
10914 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
10915 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
10916 RT_NOREF(pFpuState);
10917}
10918
10919
10920IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10921{
10922 RTUINT128U uSrc1 = *puDst;
10923
10924 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10925 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10926 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10927 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10928 RT_NOREF(pFpuState);
10929}
10930
10931
10932IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10933 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10934{
10935 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10936 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10937 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10938 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10939 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10940 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10941 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10942 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10943 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10944 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10945 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10946 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10947 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10948 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10949 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10950 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10951 RT_NOREF(pExtState);
10952}
10953
10954
10955IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10956 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10957{
10958 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10959 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10960 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10961 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10962 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10963 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10964 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10965 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10966 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10967 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10968 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10969 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10970 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10971 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10972 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10973 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10974 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
10975 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
10976 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
10977 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
10978 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
10979 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
10980 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
10981 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
10982 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
10983 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
10984 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
10985 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
10986 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
10987 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
10988 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
10989 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
10990 RT_NOREF(pExtState);
10991}
10992
10993
10994IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10995 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10996{
10997 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10998 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10999 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11000 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11001 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11002 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11003 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11004 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11005 RT_NOREF(pExtState);
11006}
11007
11008
11009IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11010 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11011{
11012 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11013 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11014 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11015 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11016 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11017 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11018 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11019 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11020 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11021 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11022 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
11023 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
11024 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
11025 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
11026 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
11027 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
11028 RT_NOREF(pExtState);
11029}
11030
11031
11032IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11033 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11034{
11035 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11036 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11037 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11038 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11039 RT_NOREF(pExtState);
11040}
11041
11042
11043IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11044 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11045{
11046 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11047 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11048 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11049 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11050 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11051 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11052 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11053 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11054 RT_NOREF(pExtState);
11055}
11056
11057
11058/*
11059 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
11060 */
11061#ifdef IEM_WITHOUT_ASSEMBLY
11062
11063IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11064{
11065 RTUINT64U uSrc1 = { *puDst };
11066 RTUINT64U uSrc2 = { *puSrc };
11067 RTUINT64U uDst;
11068
11069 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
11070 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
11071 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
11072 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
11073 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
11074 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
11075 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
11076 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
11077 *puDst = uDst.u;
11078 RT_NOREF(pFpuState);
11079}
11080
11081
11082IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11083{
11084 RTUINT128U uSrc1 = *puDst;
11085
11086 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
11087 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
11088 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
11089 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
11090 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
11091 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
11092 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
11093 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
11094 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
11095 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
11096 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
11097 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
11098 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
11099 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
11100 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
11101 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
11102 RT_NOREF(pFpuState);
11103}
11104
11105#endif
11106
11107IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11108{
11109 RTUINT128U uSrc1 = *puDst;
11110
11111 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
11112 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
11113 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
11114 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
11115 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
11116 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
11117 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
11118 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
11119 RT_NOREF(pFpuState);
11120}
11121
11122
11123IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11124{
11125 RTUINT128U uSrc1 = *puDst;
11126
11127 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
11128 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
11129 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
11130 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
11131 RT_NOREF(pFpuState);
11132}
11133
11134
11135IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11136 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11137{
11138 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11139 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11140 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11141 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11142 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11143 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11144 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11145 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11146 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11147 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11148 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11149 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11150 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11151 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11152 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11153 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11154 RT_NOREF(pExtState);
11155}
11156
11157
11158IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11159 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11160{
11161 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11162 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11163 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11164 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11165 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11166 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11167 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11168 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11169 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11170 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11171 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11172 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11173 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11174 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11175 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11176 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11177 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
11178 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
11179 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
11180 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
11181 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
11182 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
11183 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
11184 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
11185 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
11186 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
11187 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
11188 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
11189 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
11190 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
11191 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
11192 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
11193 RT_NOREF(pExtState);
11194}
11195
11196
11197IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11198 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11199{
11200 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11201 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11202 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11203 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11204 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11205 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11206 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11207 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11208 RT_NOREF(pExtState);
11209}
11210
11211
11212IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11213 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11214{
11215 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11216 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11217 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11218 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11219 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11220 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11221 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11222 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11223 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11224 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11225 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
11226 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
11227 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
11228 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
11229 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
11230 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
11231 RT_NOREF(pExtState);
11232}
11233
11234
11235IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11236 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11237{
11238 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11239 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11240 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11241 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11242 RT_NOREF(pExtState);
11243}
11244
11245
11246IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11247 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11248{
11249 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11250 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11251 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11252 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11253 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11254 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11255 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11256 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11257 RT_NOREF(pExtState);
11258}
11259
11260
11261/*
11262 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
11263 */
11264#ifdef IEM_WITHOUT_ASSEMBLY
11265
11266IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11267{
11268 RTUINT64U uSrc1 = { *puDst };
11269 RTUINT64U uSrc2 = { *puSrc };
11270 RTUINT64U uDst;
11271
11272 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
11273 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
11274 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
11275 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
11276 *puDst = uDst.u;
11277 RT_NOREF(pFpuState);
11278}
11279
11280
11281IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11282{
11283 RTUINT128U uSrc1 = *puDst;
11284
11285 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11286 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11287 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11288 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11289 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11290 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11291 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11292 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11293 RT_NOREF(pFpuState);
11294}
11295
11296#endif
11297
11298IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11299{
11300 RTUINT128U uSrc1 = *puDst;
11301
11302 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11303 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11304 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11305 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11306 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11307 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11308 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11309 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11310 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11311 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11312 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
11313 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
11314 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
11315 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
11316 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
11317 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
11318 RT_NOREF(pFpuState);
11319}
11320
11321
11322IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11323{
11324 RTUINT128U uSrc1 = *puDst;
11325
11326 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11327 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11328 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11329 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11330 RT_NOREF(pFpuState);
11331}
11332
11333
11334IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11335 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11336{
11337 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11338 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11339 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11340 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11341 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11342 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11343 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11344 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11345 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11346 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11347 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11348 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11349 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11350 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11351 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11352 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11353 RT_NOREF(pExtState);
11354}
11355
11356
11357IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11358 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11359{
11360 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11361 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11362 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11363 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11364 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11365 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11366 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11367 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11368 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11369 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11370 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11371 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11372 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11373 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11374 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11375 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11376 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
11377 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
11378 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
11379 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
11380 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
11381 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
11382 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
11383 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
11384 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
11385 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
11386 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
11387 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
11388 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
11389 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
11390 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
11391 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
11392 RT_NOREF(pExtState);
11393}
11394
11395
11396IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11397 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11398{
11399 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11400 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11401 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11402 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11403 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11404 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11405 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11406 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11407 RT_NOREF(pExtState);
11408}
11409
11410
11411IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11412 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11413{
11414 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11415 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11416 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11417 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11418 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11419 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11420 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11421 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11422 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11423 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11424 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
11425 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
11426 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
11427 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
11428 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
11429 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
11430 RT_NOREF(pExtState);
11431}
11432
11433
11434IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11435 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11436{
11437 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11438 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11439 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11440 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11441 RT_NOREF(pExtState);
11442}
11443
11444
11445IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11446 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11447{
11448 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11449 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11450 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11451 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11452 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11453 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11454 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11455 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11456 RT_NOREF(pExtState);
11457}
11458
11459
11460/*
11461 * PAVGB / VPAVGB / PAVGW / VPAVGW
11462 */
11463#define PAVGB_EXEC(a_Src1, a_Src2) ((uint8_t)(((uint16_t)(a_Src1) + (a_Src2) + 1) >> 1))
11464#define PAVGW_EXEC(a_Src1, a_Src2) ((uint16_t)(((uint32_t)(a_Src1) + (a_Src2) + 1) >> 1))
11465
11466#ifdef IEM_WITHOUT_ASSEMBLY
11467
11468IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11469{
11470 RTUINT64U uSrc1 = { *puDst };
11471 RTUINT64U uSrc2 = { *puSrc };
11472 RTUINT64U uDst;
11473
11474 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
11475 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
11476 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
11477 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
11478 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
11479 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
11480 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
11481 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
11482 *puDst = uDst.u;
11483}
11484
11485
11486IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11487{
11488 RTUINT128U uSrc1 = *puDst;
11489
11490 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11491 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11492 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11493 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11494 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11495 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11496 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11497 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11498 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11499 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11500 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11501 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11502 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11503 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11504 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11505 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11506}
11507
11508
11509IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11510{
11511 RTUINT64U uSrc1 = { *puDst };
11512 RTUINT64U uSrc2 = { *puSrc };
11513 RTUINT64U uDst;
11514
11515 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
11516 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
11517 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
11518 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
11519 *puDst = uDst.u;
11520}
11521
11522
11523IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11524{
11525 RTUINT128U uSrc1 = *puDst;
11526
11527 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
11528 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
11529 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
11530 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
11531 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
11532 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
11533 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
11534 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
11535}
11536
11537#endif
11538
11539IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11540{
11541 RTUINT128U uSrc1 = *puDst;
11542
11543 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11544 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11545 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11546 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11547 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11548 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11549 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11550 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11551 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11552 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11553 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11554 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11555 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11556 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11557 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11558 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11559}
11560
11561
11562IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11563{
11564 RTUINT128U uSrc1 = *puDst;
11565
11566 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11567 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11568 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11569 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11570 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11571 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11572 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11573 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11574 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11575 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11576 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11577 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11578 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11579 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11580 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11581 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11582}
11583
11584
11585IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11586{
11587 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11588 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11589 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11590 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11591 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11592 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11593 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11594 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11595 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11596 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11597 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11598 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11599 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11600 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11601 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11602 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11603}
11604
11605
11606IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11607{
11608 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11609 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11610 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11611 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11612 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11613 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11614 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11615 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11616 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11617 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11618 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11619 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11620 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11621 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11622 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11623 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11624 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
11625 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
11626 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
11627 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
11628 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
11629 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
11630 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
11631 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
11632 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
11633 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
11634 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
11635 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
11636 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
11637 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
11638 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
11639 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
11640}
11641
11642
11643IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11644{
11645 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11646 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11647 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11648 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11649 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11650 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11651 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11652 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11653}
11654
11655
11656IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11657{
11658 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11659 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11660 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11661 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11662 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11663 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11664 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11665 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11666 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11667 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11668 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
11669 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
11670 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
11671 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
11672 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
11673 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
11674}
11675
11676#undef PAVGB_EXEC
11677#undef PAVGW_EXEC
11678
11679
11680/*
11681 * PMOVMSKB / VPMOVMSKB
11682 */
11683#ifdef IEM_WITHOUT_ASSEMBLY
11684
11685IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
11686{
11687 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11688 uint64_t const uSrc = *pu64Src;
11689 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
11690 | ((uSrc >> (15-1)) & RT_BIT_64(1))
11691 | ((uSrc >> (23-2)) & RT_BIT_64(2))
11692 | ((uSrc >> (31-3)) & RT_BIT_64(3))
11693 | ((uSrc >> (39-4)) & RT_BIT_64(4))
11694 | ((uSrc >> (47-5)) & RT_BIT_64(5))
11695 | ((uSrc >> (55-6)) & RT_BIT_64(6))
11696 | ((uSrc >> (63-7)) & RT_BIT_64(7));
11697}
11698
11699
11700IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
11701{
11702 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11703 uint64_t const uSrc0 = pu128Src->QWords.qw0;
11704 uint64_t const uSrc1 = pu128Src->QWords.qw1;
11705 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11706 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11707 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11708 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11709 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11710 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11711 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11712 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11713 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11714 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11715 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11716 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11717 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11718 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11719 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11720 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
11721}
11722
11723#endif
11724
11725IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
11726{
11727 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11728 uint64_t const uSrc0 = puSrc->QWords.qw0;
11729 uint64_t const uSrc1 = puSrc->QWords.qw1;
11730 uint64_t const uSrc2 = puSrc->QWords.qw2;
11731 uint64_t const uSrc3 = puSrc->QWords.qw3;
11732 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11733 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11734 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11735 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11736 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11737 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11738 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11739 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11740 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11741 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11742 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11743 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11744 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11745 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11746 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11747 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
11748 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
11749 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
11750 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
11751 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
11752 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
11753 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
11754 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
11755 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
11756 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
11757 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
11758 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
11759 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
11760 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
11761 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
11762 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
11763 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
11764}
11765
11766
11767/*
11768 * [V]PSHUFB
11769 */
11770
11771IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11772{
11773 RTUINT64U const uSrc = { *puSrc };
11774 RTUINT64U const uDstIn = { *puDst };
11775 ASMCompilerBarrier();
11776 RTUINT64U uDstOut = { 0 };
11777 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
11778 {
11779 uint8_t idxSrc = uSrc.au8[iByte];
11780 if (!(idxSrc & 0x80))
11781 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
11782 }
11783 *puDst = uDstOut.u;
11784 RT_NOREF(pFpuState);
11785}
11786
11787
11788IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11789{
11790 RTUINT128U const uSrc = *puSrc;
11791 RTUINT128U const uDstIn = *puDst;
11792 ASMCompilerBarrier();
11793 puDst->au64[0] = 0;
11794 puDst->au64[1] = 0;
11795 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11796 {
11797 uint8_t idxSrc = uSrc.au8[iByte];
11798 if (!(idxSrc & 0x80))
11799 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
11800 }
11801 RT_NOREF(pFpuState);
11802}
11803
11804
11805IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11806 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11807{
11808 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
11809 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
11810 ASMCompilerBarrier();
11811 puDst->au64[0] = 0;
11812 puDst->au64[1] = 0;
11813 for (unsigned iByte = 0; iByte < 16; iByte++)
11814 {
11815 uint8_t idxSrc = uSrc2.au8[iByte];
11816 if (!(idxSrc & 0x80))
11817 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11818 }
11819 RT_NOREF(pExtState);
11820}
11821
11822
11823IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11824 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11825{
11826 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
11827 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
11828 ASMCompilerBarrier();
11829 puDst->au64[0] = 0;
11830 puDst->au64[1] = 0;
11831 puDst->au64[2] = 0;
11832 puDst->au64[3] = 0;
11833 for (unsigned iByte = 0; iByte < 16; iByte++)
11834 {
11835 uint8_t idxSrc = uSrc2.au8[iByte];
11836 if (!(idxSrc & 0x80))
11837 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11838 }
11839 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11840 {
11841 uint8_t idxSrc = uSrc2.au8[iByte];
11842 if (!(idxSrc & 0x80))
11843 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
11844 }
11845 RT_NOREF(pExtState);
11846}
11847
11848
11849/*
11850 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
11851 */
11852#ifdef IEM_WITHOUT_ASSEMBLY
11853
11854IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
11855{
11856 uint64_t const uSrc = *puSrc;
11857 ASMCompilerBarrier();
11858 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11859 uSrc >> (((bEvil >> 2) & 3) * 16),
11860 uSrc >> (((bEvil >> 4) & 3) * 16),
11861 uSrc >> (((bEvil >> 6) & 3) * 16));
11862}
11863
11864
11865IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11866{
11867 puDst->QWords.qw0 = puSrc->QWords.qw0;
11868 uint64_t const uSrc = puSrc->QWords.qw1;
11869 ASMCompilerBarrier();
11870 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11871 uSrc >> (((bEvil >> 2) & 3) * 16),
11872 uSrc >> (((bEvil >> 4) & 3) * 16),
11873 uSrc >> (((bEvil >> 6) & 3) * 16));
11874}
11875
11876#endif
11877
11878IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11879{
11880 puDst->QWords.qw0 = puSrc->QWords.qw0;
11881 uint64_t const uSrc1 = puSrc->QWords.qw1;
11882 puDst->QWords.qw2 = puSrc->QWords.qw2;
11883 uint64_t const uSrc3 = puSrc->QWords.qw3;
11884 ASMCompilerBarrier();
11885 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
11886 uSrc1 >> (((bEvil >> 2) & 3) * 16),
11887 uSrc1 >> (((bEvil >> 4) & 3) * 16),
11888 uSrc1 >> (((bEvil >> 6) & 3) * 16));
11889 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
11890 uSrc3 >> (((bEvil >> 2) & 3) * 16),
11891 uSrc3 >> (((bEvil >> 4) & 3) * 16),
11892 uSrc3 >> (((bEvil >> 6) & 3) * 16));
11893}
11894
11895#ifdef IEM_WITHOUT_ASSEMBLY
11896IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11897{
11898 puDst->QWords.qw1 = puSrc->QWords.qw1;
11899 uint64_t const uSrc = puSrc->QWords.qw0;
11900 ASMCompilerBarrier();
11901 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11902 uSrc >> (((bEvil >> 2) & 3) * 16),
11903 uSrc >> (((bEvil >> 4) & 3) * 16),
11904 uSrc >> (((bEvil >> 6) & 3) * 16));
11905
11906}
11907#endif
11908
11909
11910IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11911{
11912 puDst->QWords.qw3 = puSrc->QWords.qw3;
11913 uint64_t const uSrc2 = puSrc->QWords.qw2;
11914 puDst->QWords.qw1 = puSrc->QWords.qw1;
11915 uint64_t const uSrc0 = puSrc->QWords.qw0;
11916 ASMCompilerBarrier();
11917 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
11918 uSrc0 >> (((bEvil >> 2) & 3) * 16),
11919 uSrc0 >> (((bEvil >> 4) & 3) * 16),
11920 uSrc0 >> (((bEvil >> 6) & 3) * 16));
11921 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
11922 uSrc2 >> (((bEvil >> 2) & 3) * 16),
11923 uSrc2 >> (((bEvil >> 4) & 3) * 16),
11924 uSrc2 >> (((bEvil >> 6) & 3) * 16));
11925
11926}
11927
11928
11929#ifdef IEM_WITHOUT_ASSEMBLY
11930IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11931{
11932 RTUINT128U const uSrc = *puSrc;
11933 ASMCompilerBarrier();
11934 puDst->au32[0] = uSrc.au32[bEvil & 3];
11935 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
11936 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
11937 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
11938}
11939#endif
11940
11941
11942IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11943{
11944 RTUINT256U const uSrc = *puSrc;
11945 ASMCompilerBarrier();
11946 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
11947 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
11948 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
11949 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
11950 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
11951 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
11952 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
11953 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
11954}
11955
11956
11957/*
11958 * PUNPCKHBW - high bytes -> words
11959 */
11960#ifdef IEM_WITHOUT_ASSEMBLY
11961
11962IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11963{
11964 RTUINT64U const uSrc2 = { *puSrc };
11965 RTUINT64U const uSrc1 = { *puDst };
11966 ASMCompilerBarrier();
11967 RTUINT64U uDstOut;
11968 uDstOut.au8[0] = uSrc1.au8[4];
11969 uDstOut.au8[1] = uSrc2.au8[4];
11970 uDstOut.au8[2] = uSrc1.au8[5];
11971 uDstOut.au8[3] = uSrc2.au8[5];
11972 uDstOut.au8[4] = uSrc1.au8[6];
11973 uDstOut.au8[5] = uSrc2.au8[6];
11974 uDstOut.au8[6] = uSrc1.au8[7];
11975 uDstOut.au8[7] = uSrc2.au8[7];
11976 *puDst = uDstOut.u;
11977}
11978
11979
11980IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11981{
11982 RTUINT128U const uSrc2 = *puSrc;
11983 RTUINT128U const uSrc1 = *puDst;
11984 ASMCompilerBarrier();
11985 RTUINT128U uDstOut;
11986 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11987 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11988 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11989 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11990 uDstOut.au8[ 4] = uSrc1.au8[10];
11991 uDstOut.au8[ 5] = uSrc2.au8[10];
11992 uDstOut.au8[ 6] = uSrc1.au8[11];
11993 uDstOut.au8[ 7] = uSrc2.au8[11];
11994 uDstOut.au8[ 8] = uSrc1.au8[12];
11995 uDstOut.au8[ 9] = uSrc2.au8[12];
11996 uDstOut.au8[10] = uSrc1.au8[13];
11997 uDstOut.au8[11] = uSrc2.au8[13];
11998 uDstOut.au8[12] = uSrc1.au8[14];
11999 uDstOut.au8[13] = uSrc2.au8[14];
12000 uDstOut.au8[14] = uSrc1.au8[15];
12001 uDstOut.au8[15] = uSrc2.au8[15];
12002 *puDst = uDstOut;
12003}
12004
12005#endif
12006
12007IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12008{
12009 RTUINT128U const uSrc2 = *puSrc2;
12010 RTUINT128U const uSrc1 = *puSrc1;
12011 ASMCompilerBarrier();
12012 RTUINT128U uDstOut;
12013 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12014 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12015 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12016 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12017 uDstOut.au8[ 4] = uSrc1.au8[10];
12018 uDstOut.au8[ 5] = uSrc2.au8[10];
12019 uDstOut.au8[ 6] = uSrc1.au8[11];
12020 uDstOut.au8[ 7] = uSrc2.au8[11];
12021 uDstOut.au8[ 8] = uSrc1.au8[12];
12022 uDstOut.au8[ 9] = uSrc2.au8[12];
12023 uDstOut.au8[10] = uSrc1.au8[13];
12024 uDstOut.au8[11] = uSrc2.au8[13];
12025 uDstOut.au8[12] = uSrc1.au8[14];
12026 uDstOut.au8[13] = uSrc2.au8[14];
12027 uDstOut.au8[14] = uSrc1.au8[15];
12028 uDstOut.au8[15] = uSrc2.au8[15];
12029 *puDst = uDstOut;
12030}
12031
12032
12033IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12034{
12035 RTUINT256U const uSrc2 = *puSrc2;
12036 RTUINT256U const uSrc1 = *puSrc1;
12037 ASMCompilerBarrier();
12038 RTUINT256U uDstOut;
12039 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12040 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12041 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12042 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12043 uDstOut.au8[ 4] = uSrc1.au8[10];
12044 uDstOut.au8[ 5] = uSrc2.au8[10];
12045 uDstOut.au8[ 6] = uSrc1.au8[11];
12046 uDstOut.au8[ 7] = uSrc2.au8[11];
12047 uDstOut.au8[ 8] = uSrc1.au8[12];
12048 uDstOut.au8[ 9] = uSrc2.au8[12];
12049 uDstOut.au8[10] = uSrc1.au8[13];
12050 uDstOut.au8[11] = uSrc2.au8[13];
12051 uDstOut.au8[12] = uSrc1.au8[14];
12052 uDstOut.au8[13] = uSrc2.au8[14];
12053 uDstOut.au8[14] = uSrc1.au8[15];
12054 uDstOut.au8[15] = uSrc2.au8[15];
12055 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12056 uDstOut.au8[16] = uSrc1.au8[24];
12057 uDstOut.au8[17] = uSrc2.au8[24];
12058 uDstOut.au8[18] = uSrc1.au8[25];
12059 uDstOut.au8[19] = uSrc2.au8[25];
12060 uDstOut.au8[20] = uSrc1.au8[26];
12061 uDstOut.au8[21] = uSrc2.au8[26];
12062 uDstOut.au8[22] = uSrc1.au8[27];
12063 uDstOut.au8[23] = uSrc2.au8[27];
12064 uDstOut.au8[24] = uSrc1.au8[28];
12065 uDstOut.au8[25] = uSrc2.au8[28];
12066 uDstOut.au8[26] = uSrc1.au8[29];
12067 uDstOut.au8[27] = uSrc2.au8[29];
12068 uDstOut.au8[28] = uSrc1.au8[30];
12069 uDstOut.au8[29] = uSrc2.au8[30];
12070 uDstOut.au8[30] = uSrc1.au8[31];
12071 uDstOut.au8[31] = uSrc2.au8[31];
12072 *puDst = uDstOut;
12073}
12074
12075
12076/*
12077 * PUNPCKHBW - high words -> dwords
12078 */
12079#ifdef IEM_WITHOUT_ASSEMBLY
12080
12081IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12082{
12083 RTUINT64U const uSrc2 = { *puSrc };
12084 RTUINT64U const uSrc1 = { *puDst };
12085 ASMCompilerBarrier();
12086 RTUINT64U uDstOut;
12087 uDstOut.au16[0] = uSrc1.au16[2];
12088 uDstOut.au16[1] = uSrc2.au16[2];
12089 uDstOut.au16[2] = uSrc1.au16[3];
12090 uDstOut.au16[3] = uSrc2.au16[3];
12091 *puDst = uDstOut.u;
12092}
12093
12094
12095IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12096{
12097 RTUINT128U const uSrc2 = *puSrc;
12098 RTUINT128U const uSrc1 = *puDst;
12099 ASMCompilerBarrier();
12100 RTUINT128U uDstOut;
12101 uDstOut.au16[0] = uSrc1.au16[4];
12102 uDstOut.au16[1] = uSrc2.au16[4];
12103 uDstOut.au16[2] = uSrc1.au16[5];
12104 uDstOut.au16[3] = uSrc2.au16[5];
12105 uDstOut.au16[4] = uSrc1.au16[6];
12106 uDstOut.au16[5] = uSrc2.au16[6];
12107 uDstOut.au16[6] = uSrc1.au16[7];
12108 uDstOut.au16[7] = uSrc2.au16[7];
12109 *puDst = uDstOut;
12110}
12111
12112#endif
12113
12114IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12115{
12116 RTUINT128U const uSrc2 = *puSrc2;
12117 RTUINT128U const uSrc1 = *puSrc1;
12118 ASMCompilerBarrier();
12119 RTUINT128U uDstOut;
12120 uDstOut.au16[0] = uSrc1.au16[4];
12121 uDstOut.au16[1] = uSrc2.au16[4];
12122 uDstOut.au16[2] = uSrc1.au16[5];
12123 uDstOut.au16[3] = uSrc2.au16[5];
12124 uDstOut.au16[4] = uSrc1.au16[6];
12125 uDstOut.au16[5] = uSrc2.au16[6];
12126 uDstOut.au16[6] = uSrc1.au16[7];
12127 uDstOut.au16[7] = uSrc2.au16[7];
12128 *puDst = uDstOut;
12129}
12130
12131
12132IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12133{
12134 RTUINT256U const uSrc2 = *puSrc2;
12135 RTUINT256U const uSrc1 = *puSrc1;
12136 ASMCompilerBarrier();
12137 RTUINT256U uDstOut;
12138 uDstOut.au16[0] = uSrc1.au16[4];
12139 uDstOut.au16[1] = uSrc2.au16[4];
12140 uDstOut.au16[2] = uSrc1.au16[5];
12141 uDstOut.au16[3] = uSrc2.au16[5];
12142 uDstOut.au16[4] = uSrc1.au16[6];
12143 uDstOut.au16[5] = uSrc2.au16[6];
12144 uDstOut.au16[6] = uSrc1.au16[7];
12145 uDstOut.au16[7] = uSrc2.au16[7];
12146
12147 uDstOut.au16[8] = uSrc1.au16[12];
12148 uDstOut.au16[9] = uSrc2.au16[12];
12149 uDstOut.au16[10] = uSrc1.au16[13];
12150 uDstOut.au16[11] = uSrc2.au16[13];
12151 uDstOut.au16[12] = uSrc1.au16[14];
12152 uDstOut.au16[13] = uSrc2.au16[14];
12153 uDstOut.au16[14] = uSrc1.au16[15];
12154 uDstOut.au16[15] = uSrc2.au16[15];
12155 *puDst = uDstOut;
12156}
12157
12158
12159/*
12160 * PUNPCKHBW - high dwords -> qword(s)
12161 */
12162#ifdef IEM_WITHOUT_ASSEMBLY
12163
12164IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12165{
12166 RTUINT64U const uSrc2 = { *puSrc };
12167 RTUINT64U const uSrc1 = { *puDst };
12168 ASMCompilerBarrier();
12169 RTUINT64U uDstOut;
12170 uDstOut.au32[0] = uSrc1.au32[1];
12171 uDstOut.au32[1] = uSrc2.au32[1];
12172 *puDst = uDstOut.u;
12173}
12174
12175
12176IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12177{
12178 RTUINT128U const uSrc2 = *puSrc;
12179 RTUINT128U const uSrc1 = *puDst;
12180 ASMCompilerBarrier();
12181 RTUINT128U uDstOut;
12182 uDstOut.au32[0] = uSrc1.au32[2];
12183 uDstOut.au32[1] = uSrc2.au32[2];
12184 uDstOut.au32[2] = uSrc1.au32[3];
12185 uDstOut.au32[3] = uSrc2.au32[3];
12186 *puDst = uDstOut;
12187}
12188
12189#endif
12190
12191IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12192{
12193 RTUINT128U const uSrc2 = *puSrc2;
12194 RTUINT128U const uSrc1 = *puSrc1;
12195 ASMCompilerBarrier();
12196 RTUINT128U uDstOut;
12197 uDstOut.au32[0] = uSrc1.au32[2];
12198 uDstOut.au32[1] = uSrc2.au32[2];
12199 uDstOut.au32[2] = uSrc1.au32[3];
12200 uDstOut.au32[3] = uSrc2.au32[3];
12201 *puDst = uDstOut;
12202}
12203
12204
12205IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12206{
12207 RTUINT256U const uSrc2 = *puSrc2;
12208 RTUINT256U const uSrc1 = *puSrc1;
12209 ASMCompilerBarrier();
12210 RTUINT256U uDstOut;
12211 uDstOut.au32[0] = uSrc1.au32[2];
12212 uDstOut.au32[1] = uSrc2.au32[2];
12213 uDstOut.au32[2] = uSrc1.au32[3];
12214 uDstOut.au32[3] = uSrc2.au32[3];
12215
12216 uDstOut.au32[4] = uSrc1.au32[6];
12217 uDstOut.au32[5] = uSrc2.au32[6];
12218 uDstOut.au32[6] = uSrc1.au32[7];
12219 uDstOut.au32[7] = uSrc2.au32[7];
12220 *puDst = uDstOut;
12221}
12222
12223
12224/*
12225 * PUNPCKHQDQ -> High qwords -> double qword(s).
12226 */
12227#ifdef IEM_WITHOUT_ASSEMBLY
12228IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12229{
12230 RTUINT128U const uSrc2 = *puSrc;
12231 RTUINT128U const uSrc1 = *puDst;
12232 ASMCompilerBarrier();
12233 RTUINT128U uDstOut;
12234 uDstOut.au64[0] = uSrc1.au64[1];
12235 uDstOut.au64[1] = uSrc2.au64[1];
12236 *puDst = uDstOut;
12237}
12238#endif
12239
12240
12241IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12242{
12243 RTUINT128U const uSrc2 = *puSrc2;
12244 RTUINT128U const uSrc1 = *puSrc1;
12245 ASMCompilerBarrier();
12246 RTUINT128U uDstOut;
12247 uDstOut.au64[0] = uSrc1.au64[1];
12248 uDstOut.au64[1] = uSrc2.au64[1];
12249 *puDst = uDstOut;
12250}
12251
12252
12253IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12254{
12255 RTUINT256U const uSrc2 = *puSrc2;
12256 RTUINT256U const uSrc1 = *puSrc1;
12257 ASMCompilerBarrier();
12258 RTUINT256U uDstOut;
12259 uDstOut.au64[0] = uSrc1.au64[1];
12260 uDstOut.au64[1] = uSrc2.au64[1];
12261
12262 uDstOut.au64[2] = uSrc1.au64[3];
12263 uDstOut.au64[3] = uSrc2.au64[3];
12264 *puDst = uDstOut;
12265}
12266
12267
12268/*
12269 * PUNPCKLBW - low bytes -> words
12270 */
12271#ifdef IEM_WITHOUT_ASSEMBLY
12272
12273IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12274{
12275 RTUINT64U const uSrc2 = { *puSrc };
12276 RTUINT64U const uSrc1 = { *puDst };
12277 ASMCompilerBarrier();
12278 RTUINT64U uDstOut;
12279 uDstOut.au8[0] = uSrc1.au8[0];
12280 uDstOut.au8[1] = uSrc2.au8[0];
12281 uDstOut.au8[2] = uSrc1.au8[1];
12282 uDstOut.au8[3] = uSrc2.au8[1];
12283 uDstOut.au8[4] = uSrc1.au8[2];
12284 uDstOut.au8[5] = uSrc2.au8[2];
12285 uDstOut.au8[6] = uSrc1.au8[3];
12286 uDstOut.au8[7] = uSrc2.au8[3];
12287 *puDst = uDstOut.u;
12288}
12289
12290
12291IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12292{
12293 RTUINT128U const uSrc2 = *puSrc;
12294 RTUINT128U const uSrc1 = *puDst;
12295 ASMCompilerBarrier();
12296 RTUINT128U uDstOut;
12297 uDstOut.au8[ 0] = uSrc1.au8[0];
12298 uDstOut.au8[ 1] = uSrc2.au8[0];
12299 uDstOut.au8[ 2] = uSrc1.au8[1];
12300 uDstOut.au8[ 3] = uSrc2.au8[1];
12301 uDstOut.au8[ 4] = uSrc1.au8[2];
12302 uDstOut.au8[ 5] = uSrc2.au8[2];
12303 uDstOut.au8[ 6] = uSrc1.au8[3];
12304 uDstOut.au8[ 7] = uSrc2.au8[3];
12305 uDstOut.au8[ 8] = uSrc1.au8[4];
12306 uDstOut.au8[ 9] = uSrc2.au8[4];
12307 uDstOut.au8[10] = uSrc1.au8[5];
12308 uDstOut.au8[11] = uSrc2.au8[5];
12309 uDstOut.au8[12] = uSrc1.au8[6];
12310 uDstOut.au8[13] = uSrc2.au8[6];
12311 uDstOut.au8[14] = uSrc1.au8[7];
12312 uDstOut.au8[15] = uSrc2.au8[7];
12313 *puDst = uDstOut;
12314}
12315
12316#endif
12317
12318IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12319{
12320 RTUINT128U const uSrc2 = *puSrc2;
12321 RTUINT128U const uSrc1 = *puSrc1;
12322 ASMCompilerBarrier();
12323 RTUINT128U uDstOut;
12324 uDstOut.au8[ 0] = uSrc1.au8[0];
12325 uDstOut.au8[ 1] = uSrc2.au8[0];
12326 uDstOut.au8[ 2] = uSrc1.au8[1];
12327 uDstOut.au8[ 3] = uSrc2.au8[1];
12328 uDstOut.au8[ 4] = uSrc1.au8[2];
12329 uDstOut.au8[ 5] = uSrc2.au8[2];
12330 uDstOut.au8[ 6] = uSrc1.au8[3];
12331 uDstOut.au8[ 7] = uSrc2.au8[3];
12332 uDstOut.au8[ 8] = uSrc1.au8[4];
12333 uDstOut.au8[ 9] = uSrc2.au8[4];
12334 uDstOut.au8[10] = uSrc1.au8[5];
12335 uDstOut.au8[11] = uSrc2.au8[5];
12336 uDstOut.au8[12] = uSrc1.au8[6];
12337 uDstOut.au8[13] = uSrc2.au8[6];
12338 uDstOut.au8[14] = uSrc1.au8[7];
12339 uDstOut.au8[15] = uSrc2.au8[7];
12340 *puDst = uDstOut;
12341}
12342
12343
12344IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12345{
12346 RTUINT256U const uSrc2 = *puSrc2;
12347 RTUINT256U const uSrc1 = *puSrc1;
12348 ASMCompilerBarrier();
12349 RTUINT256U uDstOut;
12350 uDstOut.au8[ 0] = uSrc1.au8[0];
12351 uDstOut.au8[ 1] = uSrc2.au8[0];
12352 uDstOut.au8[ 2] = uSrc1.au8[1];
12353 uDstOut.au8[ 3] = uSrc2.au8[1];
12354 uDstOut.au8[ 4] = uSrc1.au8[2];
12355 uDstOut.au8[ 5] = uSrc2.au8[2];
12356 uDstOut.au8[ 6] = uSrc1.au8[3];
12357 uDstOut.au8[ 7] = uSrc2.au8[3];
12358 uDstOut.au8[ 8] = uSrc1.au8[4];
12359 uDstOut.au8[ 9] = uSrc2.au8[4];
12360 uDstOut.au8[10] = uSrc1.au8[5];
12361 uDstOut.au8[11] = uSrc2.au8[5];
12362 uDstOut.au8[12] = uSrc1.au8[6];
12363 uDstOut.au8[13] = uSrc2.au8[6];
12364 uDstOut.au8[14] = uSrc1.au8[7];
12365 uDstOut.au8[15] = uSrc2.au8[7];
12366 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12367 uDstOut.au8[16] = uSrc1.au8[16];
12368 uDstOut.au8[17] = uSrc2.au8[16];
12369 uDstOut.au8[18] = uSrc1.au8[17];
12370 uDstOut.au8[19] = uSrc2.au8[17];
12371 uDstOut.au8[20] = uSrc1.au8[18];
12372 uDstOut.au8[21] = uSrc2.au8[18];
12373 uDstOut.au8[22] = uSrc1.au8[19];
12374 uDstOut.au8[23] = uSrc2.au8[19];
12375 uDstOut.au8[24] = uSrc1.au8[20];
12376 uDstOut.au8[25] = uSrc2.au8[20];
12377 uDstOut.au8[26] = uSrc1.au8[21];
12378 uDstOut.au8[27] = uSrc2.au8[21];
12379 uDstOut.au8[28] = uSrc1.au8[22];
12380 uDstOut.au8[29] = uSrc2.au8[22];
12381 uDstOut.au8[30] = uSrc1.au8[23];
12382 uDstOut.au8[31] = uSrc2.au8[23];
12383 *puDst = uDstOut;
12384}
12385
12386
12387/*
12388 * PUNPCKLBW - low words -> dwords
12389 */
12390#ifdef IEM_WITHOUT_ASSEMBLY
12391
12392IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12393{
12394 RTUINT64U const uSrc2 = { *puSrc };
12395 RTUINT64U const uSrc1 = { *puDst };
12396 ASMCompilerBarrier();
12397 RTUINT64U uDstOut;
12398 uDstOut.au16[0] = uSrc1.au16[0];
12399 uDstOut.au16[1] = uSrc2.au16[0];
12400 uDstOut.au16[2] = uSrc1.au16[1];
12401 uDstOut.au16[3] = uSrc2.au16[1];
12402 *puDst = uDstOut.u;
12403}
12404
12405
12406IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12407{
12408 RTUINT128U const uSrc2 = *puSrc;
12409 RTUINT128U const uSrc1 = *puDst;
12410 ASMCompilerBarrier();
12411 RTUINT128U uDstOut;
12412 uDstOut.au16[0] = uSrc1.au16[0];
12413 uDstOut.au16[1] = uSrc2.au16[0];
12414 uDstOut.au16[2] = uSrc1.au16[1];
12415 uDstOut.au16[3] = uSrc2.au16[1];
12416 uDstOut.au16[4] = uSrc1.au16[2];
12417 uDstOut.au16[5] = uSrc2.au16[2];
12418 uDstOut.au16[6] = uSrc1.au16[3];
12419 uDstOut.au16[7] = uSrc2.au16[3];
12420 *puDst = uDstOut;
12421}
12422
12423#endif
12424
12425IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12426{
12427 RTUINT128U const uSrc2 = *puSrc2;
12428 RTUINT128U const uSrc1 = *puSrc1;
12429 ASMCompilerBarrier();
12430 RTUINT128U uDstOut;
12431 uDstOut.au16[0] = uSrc1.au16[0];
12432 uDstOut.au16[1] = uSrc2.au16[0];
12433 uDstOut.au16[2] = uSrc1.au16[1];
12434 uDstOut.au16[3] = uSrc2.au16[1];
12435 uDstOut.au16[4] = uSrc1.au16[2];
12436 uDstOut.au16[5] = uSrc2.au16[2];
12437 uDstOut.au16[6] = uSrc1.au16[3];
12438 uDstOut.au16[7] = uSrc2.au16[3];
12439 *puDst = uDstOut;
12440}
12441
12442
12443IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12444{
12445 RTUINT256U const uSrc2 = *puSrc2;
12446 RTUINT256U const uSrc1 = *puSrc1;
12447 ASMCompilerBarrier();
12448 RTUINT256U uDstOut;
12449 uDstOut.au16[0] = uSrc1.au16[0];
12450 uDstOut.au16[1] = uSrc2.au16[0];
12451 uDstOut.au16[2] = uSrc1.au16[1];
12452 uDstOut.au16[3] = uSrc2.au16[1];
12453 uDstOut.au16[4] = uSrc1.au16[2];
12454 uDstOut.au16[5] = uSrc2.au16[2];
12455 uDstOut.au16[6] = uSrc1.au16[3];
12456 uDstOut.au16[7] = uSrc2.au16[3];
12457
12458 uDstOut.au16[8] = uSrc1.au16[8];
12459 uDstOut.au16[9] = uSrc2.au16[8];
12460 uDstOut.au16[10] = uSrc1.au16[9];
12461 uDstOut.au16[11] = uSrc2.au16[9];
12462 uDstOut.au16[12] = uSrc1.au16[10];
12463 uDstOut.au16[13] = uSrc2.au16[10];
12464 uDstOut.au16[14] = uSrc1.au16[11];
12465 uDstOut.au16[15] = uSrc2.au16[11];
12466 *puDst = uDstOut;
12467}
12468
12469
12470/*
12471 * PUNPCKLBW - low dwords -> qword(s)
12472 */
12473#ifdef IEM_WITHOUT_ASSEMBLY
12474
12475IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12476{
12477 RTUINT64U const uSrc2 = { *puSrc };
12478 RTUINT64U const uSrc1 = { *puDst };
12479 ASMCompilerBarrier();
12480 RTUINT64U uDstOut;
12481 uDstOut.au32[0] = uSrc1.au32[0];
12482 uDstOut.au32[1] = uSrc2.au32[0];
12483 *puDst = uDstOut.u;
12484}
12485
12486
12487IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12488{
12489 RTUINT128U const uSrc2 = *puSrc;
12490 RTUINT128U const uSrc1 = *puDst;
12491 ASMCompilerBarrier();
12492 RTUINT128U uDstOut;
12493 uDstOut.au32[0] = uSrc1.au32[0];
12494 uDstOut.au32[1] = uSrc2.au32[0];
12495 uDstOut.au32[2] = uSrc1.au32[1];
12496 uDstOut.au32[3] = uSrc2.au32[1];
12497 *puDst = uDstOut;
12498}
12499
12500#endif
12501
12502IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12503{
12504 RTUINT128U const uSrc2 = *puSrc2;
12505 RTUINT128U const uSrc1 = *puSrc1;
12506 ASMCompilerBarrier();
12507 RTUINT128U uDstOut;
12508 uDstOut.au32[0] = uSrc1.au32[0];
12509 uDstOut.au32[1] = uSrc2.au32[0];
12510 uDstOut.au32[2] = uSrc1.au32[1];
12511 uDstOut.au32[3] = uSrc2.au32[1];
12512 *puDst = uDstOut;
12513}
12514
12515
12516IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12517{
12518 RTUINT256U const uSrc2 = *puSrc2;
12519 RTUINT256U const uSrc1 = *puSrc1;
12520 ASMCompilerBarrier();
12521 RTUINT256U uDstOut;
12522 uDstOut.au32[0] = uSrc1.au32[0];
12523 uDstOut.au32[1] = uSrc2.au32[0];
12524 uDstOut.au32[2] = uSrc1.au32[1];
12525 uDstOut.au32[3] = uSrc2.au32[1];
12526
12527 uDstOut.au32[4] = uSrc1.au32[4];
12528 uDstOut.au32[5] = uSrc2.au32[4];
12529 uDstOut.au32[6] = uSrc1.au32[5];
12530 uDstOut.au32[7] = uSrc2.au32[5];
12531 *puDst = uDstOut;
12532}
12533
12534
12535/*
12536 * PUNPCKLQDQ -> Low qwords -> double qword(s).
12537 */
12538#ifdef IEM_WITHOUT_ASSEMBLY
12539IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12540{
12541 RTUINT128U const uSrc2 = *puSrc;
12542 RTUINT128U const uSrc1 = *puDst;
12543 ASMCompilerBarrier();
12544 RTUINT128U uDstOut;
12545 uDstOut.au64[0] = uSrc1.au64[0];
12546 uDstOut.au64[1] = uSrc2.au64[0];
12547 *puDst = uDstOut;
12548}
12549#endif
12550
12551
12552IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12553{
12554 RTUINT128U const uSrc2 = *puSrc2;
12555 RTUINT128U const uSrc1 = *puSrc1;
12556 ASMCompilerBarrier();
12557 RTUINT128U uDstOut;
12558 uDstOut.au64[0] = uSrc1.au64[0];
12559 uDstOut.au64[1] = uSrc2.au64[0];
12560 *puDst = uDstOut;
12561}
12562
12563
12564IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12565{
12566 RTUINT256U const uSrc2 = *puSrc2;
12567 RTUINT256U const uSrc1 = *puSrc1;
12568 ASMCompilerBarrier();
12569 RTUINT256U uDstOut;
12570 uDstOut.au64[0] = uSrc1.au64[0];
12571 uDstOut.au64[1] = uSrc2.au64[0];
12572
12573 uDstOut.au64[2] = uSrc1.au64[2];
12574 uDstOut.au64[3] = uSrc2.au64[2];
12575 *puDst = uDstOut;
12576}
12577
12578
12579/*
12580 * PACKSSWB - signed words -> signed bytes
12581 */
12582
12583#ifdef IEM_WITHOUT_ASSEMBLY
12584
12585IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12586{
12587 RTUINT64U const uSrc2 = { *puSrc };
12588 RTUINT64U const uSrc1 = { *puDst };
12589 ASMCompilerBarrier();
12590 RTUINT64U uDstOut;
12591 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12592 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12593 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12594 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12595 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12596 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12597 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12598 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12599 *puDst = uDstOut.u;
12600}
12601
12602
12603IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12604{
12605 RTUINT128U const uSrc2 = *puSrc;
12606 RTUINT128U const uSrc1 = *puDst;
12607 ASMCompilerBarrier();
12608 RTUINT128U uDstOut;
12609 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12610 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12611 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12612 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12613 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12614 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12615 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12616 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12617 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12618 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12619 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12620 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12621 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12622 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12623 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12624 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12625 *puDst = uDstOut;
12626}
12627
12628#endif
12629
12630IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12631{
12632 RTUINT128U const uSrc2 = *puSrc2;
12633 RTUINT128U const uSrc1 = *puSrc1;
12634 ASMCompilerBarrier();
12635 RTUINT128U uDstOut;
12636 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12637 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12638 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12639 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12640 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12641 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12642 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12643 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12644 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12645 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12646 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12647 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12648 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12649 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12650 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12651 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12652 *puDst = uDstOut;
12653}
12654
12655
12656IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12657{
12658 RTUINT256U const uSrc2 = *puSrc2;
12659 RTUINT256U const uSrc1 = *puSrc1;
12660 ASMCompilerBarrier();
12661 RTUINT256U uDstOut;
12662 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12663 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12664 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12665 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12666 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12667 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12668 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12669 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12670 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12671 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12672 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12673 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12674 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12675 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12676 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12677 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12678
12679 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
12680 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
12681 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
12682 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
12683 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
12684 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
12685 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
12686 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
12687 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
12688 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
12689 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
12690 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
12691 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
12692 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
12693 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
12694 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
12695 *puDst = uDstOut;
12696}
12697
12698
12699/*
12700 * PACKUSWB - signed words -> unsigned bytes
12701 */
12702#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
12703 ( (uint16_t)(a_iWord) <= (uint16_t)0xff \
12704 ? (uint8_t)(a_iWord) \
12705 : (uint8_t)0xff * (uint8_t)((((a_iWord) >> 15) & 1) ^ 1) ) /* 0xff = UINT8_MAX; 0x00 == UINT8_MIN; source bit 15 = sign */
12706
12707#ifdef IEM_WITHOUT_ASSEMBLY
12708
12709IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12710{
12711 RTUINT64U const uSrc2 = { *puSrc };
12712 RTUINT64U const uSrc1 = { *puDst };
12713 ASMCompilerBarrier();
12714 RTUINT64U uDstOut;
12715 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12716 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12717 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12718 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12719 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12720 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12721 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12722 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12723 *puDst = uDstOut.u;
12724}
12725
12726
12727IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12728{
12729 RTUINT128U const uSrc2 = *puSrc;
12730 RTUINT128U const uSrc1 = *puDst;
12731 ASMCompilerBarrier();
12732 RTUINT128U uDstOut;
12733 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12734 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12735 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12736 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12737 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12738 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12739 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12740 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12741 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12742 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12743 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12744 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12745 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12746 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12747 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12748 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12749 *puDst = uDstOut;
12750}
12751
12752#endif
12753
12754IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12755{
12756 RTUINT128U const uSrc2 = *puSrc2;
12757 RTUINT128U const uSrc1 = *puSrc1;
12758 ASMCompilerBarrier();
12759 RTUINT128U uDstOut;
12760 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12761 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12762 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12763 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12764 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12765 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12766 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12767 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12768 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12769 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12770 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12771 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12772 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12773 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12774 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12775 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12776 *puDst = uDstOut;
12777}
12778
12779
12780IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12781{
12782 RTUINT256U const uSrc2 = *puSrc2;
12783 RTUINT256U const uSrc1 = *puSrc1;
12784 ASMCompilerBarrier();
12785 RTUINT256U uDstOut;
12786 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12787 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12788 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12789 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12790 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12791 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12792 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12793 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12794 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12795 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12796 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12797 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12798 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12799 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12800 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12801 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12802
12803 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
12804 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
12805 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
12806 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
12807 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
12808 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
12809 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
12810 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
12811 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
12812 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
12813 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
12814 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
12815 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
12816 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
12817 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
12818 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
12819 *puDst = uDstOut;
12820}
12821
12822
12823/*
12824 * PACKSSDW - signed dwords -> signed words
12825 */
12826
12827#ifdef IEM_WITHOUT_ASSEMBLY
12828
12829IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12830{
12831 RTUINT64U const uSrc2 = { *puSrc };
12832 RTUINT64U const uSrc1 = { *puDst };
12833 ASMCompilerBarrier();
12834 RTUINT64U uDstOut;
12835 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12836 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12837 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12838 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12839 *puDst = uDstOut.u;
12840}
12841
12842
12843IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12844{
12845 RTUINT128U const uSrc2 = *puSrc;
12846 RTUINT128U const uSrc1 = *puDst;
12847 ASMCompilerBarrier();
12848 RTUINT128U uDstOut;
12849 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12850 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12851 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12852 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12853 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12854 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12855 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12856 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12857 *puDst = uDstOut;
12858}
12859
12860#endif
12861
12862IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12863{
12864 RTUINT128U const uSrc2 = *puSrc2;
12865 RTUINT128U const uSrc1 = *puSrc1;
12866 ASMCompilerBarrier();
12867 RTUINT128U uDstOut;
12868 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12869 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12870 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12871 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12872 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12873 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12874 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12875 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12876 *puDst = uDstOut;
12877}
12878
12879
12880IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12881{
12882 RTUINT256U const uSrc2 = *puSrc2;
12883 RTUINT256U const uSrc1 = *puSrc1;
12884 ASMCompilerBarrier();
12885 RTUINT256U uDstOut;
12886 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12887 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12888 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12889 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12890 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12891 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12892 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12893 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12894
12895 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
12896 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
12897 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
12898 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
12899 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
12900 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
12901 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
12902 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
12903 *puDst = uDstOut;
12904}
12905
12906
12907/*
12908 * PACKUSDW - signed dwords -> unsigned words
12909 */
12910#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
12911 ( (uint32_t)(a_iDword) <= (uint16_t)0xffff \
12912 ? (uint16_t)(a_iDword) \
12913 : (uint16_t)0xffff * (uint16_t)((((a_iDword) >> 31) & 1) ^ 1) ) /* 0xffff = UINT16_MAX; source bit 31 = sign */
12914
12915#ifdef IEM_WITHOUT_ASSEMBLY
12916IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12917{
12918 RTUINT128U const uSrc2 = *puSrc;
12919 RTUINT128U const uSrc1 = *puDst;
12920 ASMCompilerBarrier();
12921 RTUINT128U uDstOut;
12922 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12923 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12924 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12925 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12926 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12927 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12928 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12929 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12930 *puDst = uDstOut;
12931}
12932#endif
12933
12934IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12935{
12936 RTUINT128U const uSrc2 = *puSrc2;
12937 RTUINT128U const uSrc1 = *puSrc1;
12938 ASMCompilerBarrier();
12939 RTUINT128U uDstOut;
12940 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12941 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12942 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12943 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12944 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12945 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12946 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12947 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12948 *puDst = uDstOut;
12949}
12950
12951
12952IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12953{
12954 RTUINT256U const uSrc2 = *puSrc2;
12955 RTUINT256U const uSrc1 = *puSrc1;
12956 ASMCompilerBarrier();
12957 RTUINT256U uDstOut;
12958 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12959 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12960 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12961 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12962 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12963 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12964 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12965 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12966
12967 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
12968 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
12969 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
12970 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
12971 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
12972 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
12973 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
12974 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
12975 *puDst = uDstOut;
12976}
12977
12978
12979/*
12980 * [V]PABSB / [V]PABSW / [V]PABSD
12981 */
12982
12983IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12984{
12985 RTUINT64U const uSrc = { *puSrc };
12986 RTUINT64U uDstOut = { 0 };
12987
12988 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
12989 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
12990 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
12991 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
12992 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
12993 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
12994 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
12995 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
12996 *puDst = uDstOut.u;
12997 RT_NOREF(pFpuState);
12998}
12999
13000
13001IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13002{
13003 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13004 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13005 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13006 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13007 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13008 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13009 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13010 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13011 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13012 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13013 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13014 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13015 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13016 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13017 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13018 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13019 RT_NOREF(pFpuState);
13020}
13021
13022
13023IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13024{
13025 RTUINT64U const uSrc = { *puSrc };
13026 RTUINT64U uDstOut = { 0 };
13027
13028 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
13029 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
13030 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
13031 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
13032 *puDst = uDstOut.u;
13033 RT_NOREF(pFpuState);
13034}
13035
13036
13037IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13038{
13039 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13040 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13041 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13042 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13043 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13044 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13045 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13046 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13047 RT_NOREF(pFpuState);
13048}
13049
13050
13051IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13052{
13053 RTUINT64U const uSrc = { *puSrc };
13054 RTUINT64U uDstOut = { 0 };
13055
13056 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
13057 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
13058 *puDst = uDstOut.u;
13059 RT_NOREF(pFpuState);
13060}
13061
13062
13063IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13064{
13065 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13066 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13067 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13068 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13069 RT_NOREF(pFpuState);
13070}
13071
13072
13073IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13074{
13075 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13076 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13077 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13078 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13079 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13080 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13081 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13082 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13083 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13084 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13085 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13086 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13087 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13088 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13089 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13090 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13091}
13092
13093
13094IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13095{
13096 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13097 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13098 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13099 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13100 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13101 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13102 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13103 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13104 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13105 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13106 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13107 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13108 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13109 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13110 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13111 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13112 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
13113 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
13114 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
13115 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
13116 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
13117 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
13118 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
13119 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
13120 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
13121 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
13122 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
13123 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
13124 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
13125 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
13126 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
13127 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
13128}
13129
13130
13131IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13132{
13133 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13134 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13135 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13136 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13137 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13138 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13139 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13140 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13141}
13142
13143
13144IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13145{
13146 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13147 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13148 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13149 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13150 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13151 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13152 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13153 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13154 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
13155 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
13156 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
13157 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
13158 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
13159 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
13160 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
13161 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
13162}
13163
13164
13165IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13166{
13167 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13168 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13169 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13170 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13171}
13172
13173
13174IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13175{
13176 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13177 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13178 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13179 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13180 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
13181 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
13182 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
13183 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
13184}
13185
13186
13187/*
13188 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
13189 */
13190IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13191{
13192 RTUINT64U uSrc1 = { *puDst };
13193 RTUINT64U uSrc2 = { *puSrc };
13194 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13195
13196 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
13197 {
13198 if (uSrc2.ai8[i] < 0)
13199 uDst.ai8[i] = -uSrc1.ai8[i];
13200 else if (uSrc2.ai8[i] == 0)
13201 uDst.ai8[i] = 0;
13202 else /* uSrc2.ai8[i] > 0 */
13203 uDst.ai8[i] = uSrc1.ai8[i];
13204 }
13205
13206 *puDst = uDst.u;
13207 RT_NOREF(pFpuState);
13208}
13209
13210
13211IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13212{
13213 RTUINT128U uSrc1 = *puDst;
13214
13215 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13216 {
13217 if (puSrc->ai8[i] < 0)
13218 puDst->ai8[i] = -uSrc1.ai8[i];
13219 else if (puSrc->ai8[i] == 0)
13220 puDst->ai8[i] = 0;
13221 else /* puSrc->ai8[i] > 0 */
13222 puDst->ai8[i] = uSrc1.ai8[i];
13223 }
13224
13225 RT_NOREF(pFpuState);
13226}
13227
13228
13229IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13230{
13231 RTUINT64U uSrc1 = { *puDst };
13232 RTUINT64U uSrc2 = { *puSrc };
13233 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13234
13235 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
13236 {
13237 if (uSrc2.ai16[i] < 0)
13238 uDst.ai16[i] = -uSrc1.ai16[i];
13239 else if (uSrc2.ai16[i] == 0)
13240 uDst.ai16[i] = 0;
13241 else /* uSrc2.ai16[i] > 0 */
13242 uDst.ai16[i] = uSrc1.ai16[i];
13243 }
13244
13245 *puDst = uDst.u;
13246 RT_NOREF(pFpuState);
13247}
13248
13249
13250IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13251{
13252 RTUINT128U uSrc1 = *puDst;
13253
13254 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13255 {
13256 if (puSrc->ai16[i] < 0)
13257 puDst->ai16[i] = -uSrc1.ai16[i];
13258 else if (puSrc->ai16[i] == 0)
13259 puDst->ai16[i] = 0;
13260 else /* puSrc->ai16[i] > 0 */
13261 puDst->ai16[i] = uSrc1.ai16[i];
13262 }
13263
13264 RT_NOREF(pFpuState);
13265}
13266
13267
13268IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13269{
13270 RTUINT64U uSrc1 = { *puDst };
13271 RTUINT64U uSrc2 = { *puSrc };
13272 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13273
13274 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
13275 {
13276 if (uSrc2.ai32[i] < 0)
13277 uDst.ai32[i] = -uSrc1.ai32[i];
13278 else if (uSrc2.ai32[i] == 0)
13279 uDst.ai32[i] = 0;
13280 else /* uSrc2.ai32[i] > 0 */
13281 uDst.ai32[i] = uSrc1.ai32[i];
13282 }
13283
13284 *puDst = uDst.u;
13285 RT_NOREF(pFpuState);
13286}
13287
13288
13289IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13290{
13291 RTUINT128U uSrc1 = *puDst;
13292
13293 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13294 {
13295 if (puSrc->ai32[i] < 0)
13296 puDst->ai32[i] = -uSrc1.ai32[i];
13297 else if (puSrc->ai32[i] == 0)
13298 puDst->ai32[i] = 0;
13299 else /* puSrc->ai32[i] > 0 */
13300 puDst->ai32[i] = uSrc1.ai32[i];
13301 }
13302
13303 RT_NOREF(pFpuState);
13304}
13305
13306
13307IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13308{
13309 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13310 {
13311 if (puSrc2->ai8[i] < 0)
13312 puDst->ai8[i] = -puSrc1->ai8[i];
13313 else if (puSrc2->ai8[i] == 0)
13314 puDst->ai8[i] = 0;
13315 else /* puSrc2->ai8[i] > 0 */
13316 puDst->ai8[i] = puSrc1->ai8[i];
13317 }
13318}
13319
13320
13321IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13322{
13323 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13324 {
13325 if (puSrc2->ai8[i] < 0)
13326 puDst->ai8[i] = -puSrc1->ai8[i];
13327 else if (puSrc2->ai8[i] == 0)
13328 puDst->ai8[i] = 0;
13329 else /* puSrc2->ai8[i] > 0 */
13330 puDst->ai8[i] = puSrc1->ai8[i];
13331 }
13332}
13333
13334
13335IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13336{
13337 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13338 {
13339 if (puSrc2->ai16[i] < 0)
13340 puDst->ai16[i] = -puSrc1->ai16[i];
13341 else if (puSrc2->ai16[i] == 0)
13342 puDst->ai16[i] = 0;
13343 else /* puSrc2->ai16[i] > 0 */
13344 puDst->ai16[i] = puSrc1->ai16[i];
13345 }
13346}
13347
13348
13349IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13350{
13351 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13352 {
13353 if (puSrc2->ai16[i] < 0)
13354 puDst->ai16[i] = -puSrc1->ai16[i];
13355 else if (puSrc2->ai16[i] == 0)
13356 puDst->ai16[i] = 0;
13357 else /* puSrc2->ai16[i] > 0 */
13358 puDst->ai16[i] = puSrc1->ai16[i];
13359 }
13360}
13361
13362
13363IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13364{
13365 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13366 {
13367 if (puSrc2->ai32[i] < 0)
13368 puDst->ai32[i] = -puSrc1->ai32[i];
13369 else if (puSrc2->ai32[i] == 0)
13370 puDst->ai32[i] = 0;
13371 else /* puSrc2->ai32[i] > 0 */
13372 puDst->ai32[i] = puSrc1->ai32[i];
13373 }
13374}
13375
13376
13377IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13378{
13379 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13380 {
13381 if (puSrc2->ai32[i] < 0)
13382 puDst->ai32[i] = -puSrc1->ai32[i];
13383 else if (puSrc2->ai32[i] == 0)
13384 puDst->ai32[i] = 0;
13385 else /* puSrc2->ai32[i] > 0 */
13386 puDst->ai32[i] = puSrc1->ai32[i];
13387 }
13388}
13389
13390
13391/*
13392 * PHADDW / VPHADDW / PHADDD / VPHADDD
13393 */
13394IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13395{
13396 RTUINT64U uSrc1 = { *puDst };
13397 RTUINT64U uSrc2 = { *puSrc };
13398 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13399
13400 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13401 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13402 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
13403 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
13404 *puDst = uDst.u;
13405 RT_NOREF(pFpuState);
13406}
13407
13408
13409IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13410{
13411 RTUINT128U uSrc1 = *puDst;
13412
13413 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13414 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13415 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
13416 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
13417
13418 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
13419 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
13420 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
13421 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
13422 RT_NOREF(pFpuState);
13423}
13424
13425
13426IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13427{
13428 RTUINT64U uSrc1 = { *puDst };
13429 RTUINT64U uSrc2 = { *puSrc };
13430 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13431
13432 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13433 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
13434 *puDst = uDst.u;
13435 RT_NOREF(pFpuState);
13436}
13437
13438
13439IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13440{
13441 RTUINT128U uSrc1 = *puDst;
13442
13443 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13444 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
13445
13446 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
13447 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
13448 RT_NOREF(pFpuState);
13449}
13450
13451
13452IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13453{
13454 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13455
13456 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
13457 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
13458 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
13459 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
13460
13461 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
13462 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
13463 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
13464 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
13465
13466 puDst->au64[0] = uDst.au64[0];
13467 puDst->au64[1] = uDst.au64[1];
13468}
13469
13470
13471IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13472{
13473 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13474
13475 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
13476 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
13477 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
13478 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
13479 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
13480 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
13481 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
13482 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
13483
13484 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
13485 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
13486 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
13487 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
13488 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
13489 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
13490 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
13491 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
13492
13493 puDst->au64[0] = uDst.au64[0];
13494 puDst->au64[1] = uDst.au64[1];
13495 puDst->au64[2] = uDst.au64[2];
13496 puDst->au64[3] = uDst.au64[3];
13497}
13498
13499
13500IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13501{
13502 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13503
13504 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
13505 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
13506
13507 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
13508 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
13509
13510 puDst->au64[0] = uDst.au64[0];
13511 puDst->au64[1] = uDst.au64[1];
13512}
13513
13514
13515IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13516{
13517 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13518
13519 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
13520 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
13521 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
13522 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
13523
13524 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
13525 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
13526 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
13527 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
13528
13529 puDst->au64[0] = uDst.au64[0];
13530 puDst->au64[1] = uDst.au64[1];
13531 puDst->au64[2] = uDst.au64[2];
13532 puDst->au64[3] = uDst.au64[3];
13533}
13534
13535
13536/*
13537 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
13538 */
13539IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13540{
13541 RTUINT64U uSrc1 = { *puDst };
13542 RTUINT64U uSrc2 = { *puSrc };
13543 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13544
13545 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13546 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13547 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
13548 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
13549 *puDst = uDst.u;
13550 RT_NOREF(pFpuState);
13551}
13552
13553
13554IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13555{
13556 RTUINT128U uSrc1 = *puDst;
13557
13558 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13559 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13560 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
13561 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
13562
13563 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
13564 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
13565 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
13566 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
13567 RT_NOREF(pFpuState);
13568}
13569
13570
13571IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13572{
13573 RTUINT64U uSrc1 = { *puDst };
13574 RTUINT64U uSrc2 = { *puSrc };
13575 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13576
13577 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13578 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
13579 *puDst = uDst.u;
13580 RT_NOREF(pFpuState);
13581}
13582
13583
13584IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13585{
13586 RTUINT128U uSrc1 = *puDst;
13587
13588 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13589 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
13590
13591 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
13592 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
13593 RT_NOREF(pFpuState);
13594}
13595
13596
13597IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13598{
13599 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13600
13601 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
13602 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
13603 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
13604 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
13605
13606 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
13607 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
13608 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
13609 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
13610
13611 puDst->au64[0] = uDst.au64[0];
13612 puDst->au64[1] = uDst.au64[1];
13613}
13614
13615
13616IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13617{
13618 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13619
13620 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
13621 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
13622 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
13623 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
13624 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
13625 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
13626 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
13627 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
13628
13629 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
13630 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
13631 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
13632 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
13633 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
13634 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
13635 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
13636 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
13637
13638 puDst->au64[0] = uDst.au64[0];
13639 puDst->au64[1] = uDst.au64[1];
13640 puDst->au64[2] = uDst.au64[2];
13641 puDst->au64[3] = uDst.au64[3];
13642}
13643
13644
13645IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13646{
13647 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13648
13649 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
13650 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
13651
13652 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
13653 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
13654
13655 puDst->au64[0] = uDst.au64[0];
13656 puDst->au64[1] = uDst.au64[1];
13657}
13658
13659
13660IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13661{
13662 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13663
13664 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
13665 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
13666 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
13667 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
13668
13669 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
13670 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
13671 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
13672 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
13673
13674 puDst->au64[0] = uDst.au64[0];
13675 puDst->au64[1] = uDst.au64[1];
13676 puDst->au64[2] = uDst.au64[2];
13677 puDst->au64[3] = uDst.au64[3];
13678}
13679
13680
13681/*
13682 * PHADDSW / VPHADDSW
13683 */
13684IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13685{
13686 RTUINT64U uSrc1 = { *puDst };
13687 RTUINT64U uSrc2 = { *puSrc };
13688 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13689
13690 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13691 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13692 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
13693 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
13694 *puDst = uDst.u;
13695 RT_NOREF(pFpuState);
13696}
13697
13698
13699IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13700{
13701 RTUINT128U uSrc1 = *puDst;
13702
13703 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13704 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13705 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
13706 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
13707
13708 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
13709 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
13710 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
13711 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
13712 RT_NOREF(pFpuState);
13713}
13714
13715
13716IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13717{
13718 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13719
13720 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
13721 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
13722 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
13723 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
13724
13725 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
13726 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
13727 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
13728 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
13729
13730 puDst->au64[0] = uDst.au64[0];
13731 puDst->au64[1] = uDst.au64[1];
13732}
13733
13734
13735IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13736{
13737 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13738
13739 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
13740 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
13741 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
13742 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
13743 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
13744 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
13745 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
13746 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
13747
13748 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
13749 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
13750 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
13751 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
13752 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
13753 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
13754 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
13755 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
13756
13757 puDst->au64[0] = uDst.au64[0];
13758 puDst->au64[1] = uDst.au64[1];
13759 puDst->au64[2] = uDst.au64[2];
13760 puDst->au64[3] = uDst.au64[3];
13761}
13762
13763
13764/*
13765 * PHSUBSW / VPHSUBSW
13766 */
13767IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13768{
13769 RTUINT64U uSrc1 = { *puDst };
13770 RTUINT64U uSrc2 = { *puSrc };
13771 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13772
13773 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13774 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13775 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
13776 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
13777 *puDst = uDst.u;
13778 RT_NOREF(pFpuState);
13779}
13780
13781
13782IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13783{
13784 RTUINT128U uSrc1 = *puDst;
13785
13786 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13787 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13788 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
13789 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
13790
13791 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
13792 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
13793 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
13794 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
13795 RT_NOREF(pFpuState);
13796}
13797
13798
13799IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13800{
13801 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13802
13803 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
13804 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
13805 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
13806 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
13807
13808 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
13809 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
13810 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
13811 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
13812
13813 puDst->au64[0] = uDst.au64[0];
13814 puDst->au64[1] = uDst.au64[1];
13815}
13816
13817
13818IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13819{
13820 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13821
13822 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
13823 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
13824 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
13825 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
13826 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
13827 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
13828 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
13829 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
13830
13831 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
13832 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
13833 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
13834 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
13835 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
13836 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
13837 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
13838 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
13839
13840 puDst->au64[0] = uDst.au64[0];
13841 puDst->au64[1] = uDst.au64[1];
13842 puDst->au64[2] = uDst.au64[2];
13843 puDst->au64[3] = uDst.au64[3];
13844}
13845
13846
13847/*
13848 * PMADDUBSW / VPMADDUBSW
13849 */
13850IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13851{
13852 RTUINT64U uSrc1 = { *puDst };
13853 RTUINT64U uSrc2 = { *puSrc };
13854 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13855
13856 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
13857 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
13858 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
13859 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
13860 *puDst = uDst.u;
13861 RT_NOREF(pFpuState);
13862}
13863
13864
13865IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13866{
13867 RTUINT128U uSrc1 = *puDst;
13868
13869 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
13870 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
13871 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
13872 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
13873 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
13874 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
13875 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
13876 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
13877 RT_NOREF(pFpuState);
13878}
13879
13880
13881IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13882{
13883 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13884
13885 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13886 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13887 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13888 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13889 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13890 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13891 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13892 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13893
13894 puDst->au64[0] = uDst.au64[0];
13895 puDst->au64[1] = uDst.au64[1];
13896}
13897
13898
13899IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13900{
13901 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13902
13903 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13904 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13905 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13906 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13907 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13908 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13909 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13910 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13911 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
13912 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
13913 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
13914 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
13915 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
13916 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
13917 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
13918 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
13919
13920 puDst->au64[0] = uDst.au64[0];
13921 puDst->au64[1] = uDst.au64[1];
13922 puDst->au64[2] = uDst.au64[2];
13923 puDst->au64[3] = uDst.au64[3];
13924}
13925
13926
13927/*
13928 * PMULHRSW / VPMULHRSW
13929 */
13930#define DO_PMULHRSW(a_Src1, a_Src2) \
13931 (uint16_t)(((((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1) >> 1)
13932
13933IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13934{
13935 RTUINT64U uSrc1 = { *puDst };
13936 RTUINT64U uSrc2 = { *puSrc };
13937 RTUINT64U uDst;
13938
13939 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
13940 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
13941 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
13942 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
13943 *puDst = uDst.u;
13944 RT_NOREF(pFpuState);
13945}
13946
13947
13948IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13949{
13950 RTUINT128U uSrc1 = *puDst;
13951
13952 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
13953 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
13954 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
13955 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
13956 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
13957 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
13958 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
13959 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
13960 RT_NOREF(pFpuState);
13961}
13962
13963
13964IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13965{
13966 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13967
13968 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
13969 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
13970 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
13971 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
13972 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
13973 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
13974 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
13975 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
13976
13977 puDst->au64[0] = uDst.au64[0];
13978 puDst->au64[1] = uDst.au64[1];
13979}
13980
13981
13982IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13983{
13984 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13985
13986 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
13987 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
13988 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
13989 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
13990 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
13991 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
13992 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
13993 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
13994 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
13995 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
13996 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
13997 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
13998 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
13999 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
14000 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
14001 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
14002
14003 puDst->au64[0] = uDst.au64[0];
14004 puDst->au64[1] = uDst.au64[1];
14005 puDst->au64[2] = uDst.au64[2];
14006 puDst->au64[3] = uDst.au64[3];
14007}
14008
14009
14010/*
14011 * PSADBW / VPSADBW
14012 */
14013#ifdef IEM_WITHOUT_ASSEMBLY
14014
14015IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
14016{
14017 RTUINT64U uSrc1 = { *puDst };
14018 RTUINT64U uSrc2 = { *puSrc };
14019 RTUINT64U uDst;
14020 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14021 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14022 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14023 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14024 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14025 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14026 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14027 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14028
14029 uDst.au64[0] = 0;
14030 uDst.au16[0] = uSum;
14031 *puDst = uDst.u;
14032}
14033
14034
14035IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14036{
14037 RTUINT128U uSrc1 = *puDst;
14038
14039 puDst->au64[0] = 0;
14040 puDst->au64[1] = 0;
14041
14042 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
14043 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
14044 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
14045 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
14046 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
14047 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
14048 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
14049 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
14050 puDst->au16[0] = uSum;
14051
14052 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
14053 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
14054 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
14055 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
14056 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
14057 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
14058 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
14059 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
14060 puDst->au16[4] = uSum;
14061}
14062
14063#endif
14064
14065IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14066{
14067 RTUINT128U uSrc1 = *puSrc1;
14068 RTUINT128U uSrc2 = *puSrc2;
14069
14070 puDst->au64[0] = 0;
14071 puDst->au64[1] = 0;
14072
14073 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
14074 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14075 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14076 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14077 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14078 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14079 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14080 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14081 puDst->au16[0] = uSum;
14082
14083 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14084 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14085 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14086 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14087 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14088 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14089 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14090 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14091 puDst->au16[4] = uSum;
14092}
14093
14094IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14095{
14096 RTUINT256U uSrc1 = *puSrc1;
14097 RTUINT256U uSrc2 = *puSrc2;
14098
14099 puDst->au64[0] = 0;
14100 puDst->au64[1] = 0;
14101 puDst->au64[2] = 0;
14102 puDst->au64[3] = 0;
14103
14104 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14105 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14106 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14107 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14108 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14109 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14110 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14111 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14112 puDst->au16[0] = uSum;
14113
14114 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14115 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14116 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14117 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14118 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14119 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14120 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14121 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14122 puDst->au16[4] = uSum;
14123
14124 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
14125 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
14126 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
14127 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
14128 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
14129 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
14130 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
14131 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
14132 puDst->au16[8] = uSum;
14133
14134 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
14135 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
14136 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
14137 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
14138 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
14139 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
14140 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
14141 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
14142 puDst->au16[12] = uSum;
14143}
14144
14145
14146/*
14147 * PMULDQ / VPMULDQ
14148 */
14149IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14150{
14151 RTUINT128U uSrc1 = *puDst;
14152
14153 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
14154 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
14155}
14156
14157IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14158{
14159 RTUINT128U uSrc1 = *puSrc1;
14160 RTUINT128U uSrc2 = *puSrc2;
14161
14162 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14163 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14164}
14165
14166IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14167{
14168 RTUINT256U uSrc1 = *puSrc1;
14169 RTUINT256U uSrc2 = *puSrc2;
14170
14171 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14172 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14173 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
14174 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
14175}
14176
14177
14178/*
14179 * PMULUDQ / VPMULUDQ
14180 */
14181#ifdef IEM_WITHOUT_ASSEMBLY
14182
14183IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14184{
14185 RTUINT64U uSrc1 = { *puDst };
14186 RTUINT64U uSrc2 = { *puSrc };
14187 ASMCompilerBarrier();
14188 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14189 RT_NOREF(pFpuState);
14190}
14191
14192
14193IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14194{
14195 RTUINT128U uSrc1 = *puDst;
14196 RTUINT128U uSrc2 = *puSrc;
14197 ASMCompilerBarrier();
14198 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14199 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14200 RT_NOREF(pFpuState);
14201}
14202
14203#endif
14204
14205IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14206{
14207 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14208 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14209 ASMCompilerBarrier();
14210 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14211 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14212}
14213
14214
14215IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14216{
14217 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14218 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14219 ASMCompilerBarrier();
14220 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14221 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14222 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
14223 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
14224}
14225
14226
14227/*
14228 * UNPCKLPS / VUNPCKLPS
14229 */
14230#ifdef IEM_WITHOUT_ASSEMBLY
14231IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14232{
14233 RTUINT128U uSrc1 = *puDst;
14234 RTUINT128U uSrc2 = *puSrc;
14235 ASMCompilerBarrier();
14236 puDst->au32[0] = uSrc1.au32[0];
14237 puDst->au32[1] = uSrc2.au32[0];
14238 puDst->au32[2] = uSrc1.au32[1];
14239 puDst->au32[3] = uSrc2.au32[1];
14240}
14241
14242#endif
14243
14244IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14245{
14246 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14247 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14248 ASMCompilerBarrier();
14249 puDst->au32[0] = uSrc1.au32[0];
14250 puDst->au32[1] = uSrc2.au32[0];
14251 puDst->au32[2] = uSrc1.au32[1];
14252 puDst->au32[3] = uSrc2.au32[1];
14253}
14254
14255
14256IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14257{
14258 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14259 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14260 ASMCompilerBarrier();
14261 puDst->au32[0] = uSrc1.au32[0];
14262 puDst->au32[1] = uSrc2.au32[0];
14263 puDst->au32[2] = uSrc1.au32[1];
14264 puDst->au32[3] = uSrc2.au32[1];
14265
14266 puDst->au32[4] = uSrc1.au32[4];
14267 puDst->au32[5] = uSrc2.au32[4];
14268 puDst->au32[6] = uSrc1.au32[5];
14269 puDst->au32[7] = uSrc2.au32[5];
14270}
14271
14272
14273/*
14274 * UNPCKLPD / VUNPCKLPD
14275 */
14276#ifdef IEM_WITHOUT_ASSEMBLY
14277IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14278{
14279 RTUINT128U uSrc1 = *puDst;
14280 RTUINT128U uSrc2 = *puSrc;
14281 ASMCompilerBarrier();
14282 puDst->au64[0] = uSrc1.au64[0];
14283 puDst->au64[1] = uSrc2.au64[0];
14284}
14285
14286#endif
14287
14288IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14289{
14290 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14291 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14292 ASMCompilerBarrier();
14293 puDst->au64[0] = uSrc1.au64[0];
14294 puDst->au64[1] = uSrc2.au64[0];
14295}
14296
14297
14298IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14299{
14300 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14301 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14302 ASMCompilerBarrier();
14303 puDst->au64[0] = uSrc1.au64[0];
14304 puDst->au64[1] = uSrc2.au64[0];
14305 puDst->au64[2] = uSrc1.au64[2];
14306 puDst->au64[3] = uSrc2.au64[2];
14307}
14308
14309
14310/*
14311 * UNPCKHPS / VUNPCKHPS
14312 */
14313#ifdef IEM_WITHOUT_ASSEMBLY
14314IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14315{
14316 RTUINT128U uSrc1 = *puDst;
14317 RTUINT128U uSrc2 = *puSrc;
14318 ASMCompilerBarrier();
14319 puDst->au32[0] = uSrc1.au32[2];
14320 puDst->au32[1] = uSrc2.au32[2];
14321 puDst->au32[2] = uSrc1.au32[3];
14322 puDst->au32[3] = uSrc2.au32[3];
14323}
14324
14325#endif
14326
14327IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14328{
14329 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14330 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14331 ASMCompilerBarrier();
14332 puDst->au32[0] = uSrc1.au32[2];
14333 puDst->au32[1] = uSrc2.au32[2];
14334 puDst->au32[2] = uSrc1.au32[3];
14335 puDst->au32[3] = uSrc2.au32[3];
14336}
14337
14338
14339IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14340{
14341 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14342 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14343 ASMCompilerBarrier();
14344 puDst->au32[0] = uSrc1.au32[2];
14345 puDst->au32[1] = uSrc2.au32[2];
14346 puDst->au32[2] = uSrc1.au32[3];
14347 puDst->au32[3] = uSrc2.au32[3];
14348
14349 puDst->au32[4] = uSrc1.au32[6];
14350 puDst->au32[5] = uSrc2.au32[6];
14351 puDst->au32[6] = uSrc1.au32[7];
14352 puDst->au32[7] = uSrc2.au32[7];
14353}
14354
14355
14356/*
14357 * UNPCKHPD / VUNPCKHPD
14358 */
14359#ifdef IEM_WITHOUT_ASSEMBLY
14360IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14361{
14362 RTUINT128U uSrc1 = *puDst;
14363 RTUINT128U uSrc2 = *puSrc;
14364 ASMCompilerBarrier();
14365 puDst->au64[0] = uSrc1.au64[1];
14366 puDst->au64[1] = uSrc2.au64[1];
14367}
14368
14369#endif
14370
14371IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14372{
14373 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14374 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14375 ASMCompilerBarrier();
14376 puDst->au64[0] = uSrc1.au64[1];
14377 puDst->au64[1] = uSrc2.au64[1];
14378}
14379
14380
14381IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14382{
14383 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14384 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14385 ASMCompilerBarrier();
14386 puDst->au64[0] = uSrc1.au64[1];
14387 puDst->au64[1] = uSrc2.au64[1];
14388 puDst->au64[2] = uSrc1.au64[3];
14389 puDst->au64[3] = uSrc2.au64[3];
14390}
14391
14392
14393/*
14394 * CRC32 (SEE 4.2).
14395 */
14396
14397IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
14398{
14399 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14400}
14401
14402
14403IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
14404{
14405 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14406}
14407
14408IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
14409{
14410 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14411}
14412
14413IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
14414{
14415 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14416}
14417
14418
14419/*
14420 * PTEST (SSE 4.1) - special as it output only EFLAGS.
14421 */
14422#ifdef IEM_WITHOUT_ASSEMBLY
14423IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
14424{
14425 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14426 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14427 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14428 fEfl |= X86_EFL_ZF;
14429 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14430 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14431 fEfl |= X86_EFL_CF;
14432 *pfEFlags = fEfl;
14433}
14434#endif
14435
14436IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
14437{
14438 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14439 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14440 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
14441 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
14442 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14443 fEfl |= X86_EFL_ZF;
14444 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14445 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
14446 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
14447 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14448 fEfl |= X86_EFL_CF;
14449 *pfEFlags = fEfl;
14450}
14451
14452
14453/*
14454 * PMOVSXBW / VPMOVSXBW
14455 */
14456IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14457{
14458 RTUINT64U uSrc1 = { uSrc };
14459 puDst->ai16[0] = uSrc1.ai8[0];
14460 puDst->ai16[1] = uSrc1.ai8[1];
14461 puDst->ai16[2] = uSrc1.ai8[2];
14462 puDst->ai16[3] = uSrc1.ai8[3];
14463 puDst->ai16[4] = uSrc1.ai8[4];
14464 puDst->ai16[5] = uSrc1.ai8[5];
14465 puDst->ai16[6] = uSrc1.ai8[6];
14466 puDst->ai16[7] = uSrc1.ai8[7];
14467}
14468
14469
14470IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14471{
14472 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14473 puDst->ai16[ 0] = uSrc1.ai8[ 0];
14474 puDst->ai16[ 1] = uSrc1.ai8[ 1];
14475 puDst->ai16[ 2] = uSrc1.ai8[ 2];
14476 puDst->ai16[ 3] = uSrc1.ai8[ 3];
14477 puDst->ai16[ 4] = uSrc1.ai8[ 4];
14478 puDst->ai16[ 5] = uSrc1.ai8[ 5];
14479 puDst->ai16[ 6] = uSrc1.ai8[ 6];
14480 puDst->ai16[ 7] = uSrc1.ai8[ 7];
14481 puDst->ai16[ 8] = uSrc1.ai8[ 8];
14482 puDst->ai16[ 9] = uSrc1.ai8[ 9];
14483 puDst->ai16[10] = uSrc1.ai8[10];
14484 puDst->ai16[11] = uSrc1.ai8[11];
14485 puDst->ai16[12] = uSrc1.ai8[12];
14486 puDst->ai16[13] = uSrc1.ai8[13];
14487 puDst->ai16[14] = uSrc1.ai8[14];
14488 puDst->ai16[15] = uSrc1.ai8[15];
14489}
14490
14491
14492/*
14493 * PMOVSXBD / VPMOVSXBD
14494 */
14495IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14496{
14497 RTUINT32U uSrc1 = { uSrc };
14498 puDst->ai32[0] = uSrc1.ai8[0];
14499 puDst->ai32[1] = uSrc1.ai8[1];
14500 puDst->ai32[2] = uSrc1.ai8[2];
14501 puDst->ai32[3] = uSrc1.ai8[3];
14502}
14503
14504
14505IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14506{
14507 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14508 puDst->ai32[0] = uSrc1.ai8[0];
14509 puDst->ai32[1] = uSrc1.ai8[1];
14510 puDst->ai32[2] = uSrc1.ai8[2];
14511 puDst->ai32[3] = uSrc1.ai8[3];
14512 puDst->ai32[4] = uSrc1.ai8[4];
14513 puDst->ai32[5] = uSrc1.ai8[5];
14514 puDst->ai32[6] = uSrc1.ai8[6];
14515 puDst->ai32[7] = uSrc1.ai8[7];
14516}
14517
14518
14519/*
14520 * PMOVSXBQ / VPMOVSXBQ
14521 */
14522IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14523{
14524 RTUINT16U uSrc1 = { uSrc };
14525 puDst->ai64[0] = uSrc1.ai8[0];
14526 puDst->ai64[1] = uSrc1.ai8[1];
14527}
14528
14529
14530IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14531{
14532 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14533 puDst->ai64[0] = uSrc1.ai8[0];
14534 puDst->ai64[1] = uSrc1.ai8[1];
14535 puDst->ai64[2] = uSrc1.ai8[2];
14536 puDst->ai64[3] = uSrc1.ai8[3];
14537}
14538
14539
14540/*
14541 * PMOVSXWD / VPMOVSXWD
14542 */
14543IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14544{
14545 RTUINT64U uSrc1 = { uSrc };
14546 puDst->ai32[0] = uSrc1.ai16[0];
14547 puDst->ai32[1] = uSrc1.ai16[1];
14548 puDst->ai32[2] = uSrc1.ai16[2];
14549 puDst->ai32[3] = uSrc1.ai16[3];
14550}
14551
14552
14553IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14554{
14555 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14556 puDst->ai32[0] = uSrc1.ai16[0];
14557 puDst->ai32[1] = uSrc1.ai16[1];
14558 puDst->ai32[2] = uSrc1.ai16[2];
14559 puDst->ai32[3] = uSrc1.ai16[3];
14560 puDst->ai32[4] = uSrc1.ai16[4];
14561 puDst->ai32[5] = uSrc1.ai16[5];
14562 puDst->ai32[6] = uSrc1.ai16[6];
14563 puDst->ai32[7] = uSrc1.ai16[7];
14564}
14565
14566
14567/*
14568 * PMOVSXWQ / VPMOVSXWQ
14569 */
14570IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14571{
14572 RTUINT32U uSrc1 = { uSrc };
14573 puDst->ai64[0] = uSrc1.ai16[0];
14574 puDst->ai64[1] = uSrc1.ai16[1];
14575}
14576
14577
14578IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14579{
14580 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14581 puDst->ai64[0] = uSrc1.ai16[0];
14582 puDst->ai64[1] = uSrc1.ai16[1];
14583 puDst->ai64[2] = uSrc1.ai16[2];
14584 puDst->ai64[3] = uSrc1.ai16[3];
14585}
14586
14587
14588/*
14589 * PMOVSXDQ / VPMOVSXDQ
14590 */
14591IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14592{
14593 RTUINT64U uSrc1 = { uSrc };
14594 puDst->ai64[0] = uSrc1.ai32[0];
14595 puDst->ai64[1] = uSrc1.ai32[1];
14596}
14597
14598
14599IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14600{
14601 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14602 puDst->ai64[0] = uSrc1.ai32[0];
14603 puDst->ai64[1] = uSrc1.ai32[1];
14604 puDst->ai64[2] = uSrc1.ai32[2];
14605 puDst->ai64[3] = uSrc1.ai32[3];
14606}
14607
14608
14609/*
14610 * PMOVZXBW / VPMOVZXBW
14611 */
14612IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14613{
14614 RTUINT64U uSrc1 = { uSrc };
14615 puDst->au16[0] = uSrc1.au8[0];
14616 puDst->au16[1] = uSrc1.au8[1];
14617 puDst->au16[2] = uSrc1.au8[2];
14618 puDst->au16[3] = uSrc1.au8[3];
14619 puDst->au16[4] = uSrc1.au8[4];
14620 puDst->au16[5] = uSrc1.au8[5];
14621 puDst->au16[6] = uSrc1.au8[6];
14622 puDst->au16[7] = uSrc1.au8[7];
14623}
14624
14625
14626IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14627{
14628 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14629 puDst->au16[ 0] = uSrc1.au8[ 0];
14630 puDst->au16[ 1] = uSrc1.au8[ 1];
14631 puDst->au16[ 2] = uSrc1.au8[ 2];
14632 puDst->au16[ 3] = uSrc1.au8[ 3];
14633 puDst->au16[ 4] = uSrc1.au8[ 4];
14634 puDst->au16[ 5] = uSrc1.au8[ 5];
14635 puDst->au16[ 6] = uSrc1.au8[ 6];
14636 puDst->au16[ 7] = uSrc1.au8[ 7];
14637 puDst->au16[ 8] = uSrc1.au8[ 8];
14638 puDst->au16[ 9] = uSrc1.au8[ 9];
14639 puDst->au16[10] = uSrc1.au8[10];
14640 puDst->au16[11] = uSrc1.au8[11];
14641 puDst->au16[12] = uSrc1.au8[12];
14642 puDst->au16[13] = uSrc1.au8[13];
14643 puDst->au16[14] = uSrc1.au8[14];
14644 puDst->au16[15] = uSrc1.au8[15];
14645}
14646
14647
14648/*
14649 * PMOVZXBD / VPMOVZXBD
14650 */
14651IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14652{
14653 RTUINT32U uSrc1 = { uSrc };
14654 puDst->au32[0] = uSrc1.au8[0];
14655 puDst->au32[1] = uSrc1.au8[1];
14656 puDst->au32[2] = uSrc1.au8[2];
14657 puDst->au32[3] = uSrc1.au8[3];
14658}
14659
14660
14661IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14662{
14663 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14664 puDst->au32[0] = uSrc1.au8[0];
14665 puDst->au32[1] = uSrc1.au8[1];
14666 puDst->au32[2] = uSrc1.au8[2];
14667 puDst->au32[3] = uSrc1.au8[3];
14668 puDst->au32[4] = uSrc1.au8[4];
14669 puDst->au32[5] = uSrc1.au8[5];
14670 puDst->au32[6] = uSrc1.au8[6];
14671 puDst->au32[7] = uSrc1.au8[7];
14672}
14673
14674
14675/*
14676 * PMOVZXBQ / VPMOVZXBQ
14677 */
14678IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14679{
14680 RTUINT16U uSrc1 = { uSrc };
14681 puDst->au64[0] = uSrc1.au8[0];
14682 puDst->au64[1] = uSrc1.au8[1];
14683}
14684
14685
14686IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14687{
14688 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14689 puDst->au64[0] = uSrc1.au8[0];
14690 puDst->au64[1] = uSrc1.au8[1];
14691 puDst->au64[2] = uSrc1.au8[2];
14692 puDst->au64[3] = uSrc1.au8[3];
14693}
14694
14695
14696/*
14697 * PMOVZXWD / VPMOVZXWD
14698 */
14699IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14700{
14701 RTUINT64U uSrc1 = { uSrc };
14702 puDst->au32[0] = uSrc1.au16[0];
14703 puDst->au32[1] = uSrc1.au16[1];
14704 puDst->au32[2] = uSrc1.au16[2];
14705 puDst->au32[3] = uSrc1.au16[3];
14706}
14707
14708
14709IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14710{
14711 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14712 puDst->au32[0] = uSrc1.au16[0];
14713 puDst->au32[1] = uSrc1.au16[1];
14714 puDst->au32[2] = uSrc1.au16[2];
14715 puDst->au32[3] = uSrc1.au16[3];
14716 puDst->au32[4] = uSrc1.au16[4];
14717 puDst->au32[5] = uSrc1.au16[5];
14718 puDst->au32[6] = uSrc1.au16[6];
14719 puDst->au32[7] = uSrc1.au16[7];
14720}
14721
14722
14723/*
14724 * PMOVZXWQ / VPMOVZXWQ
14725 */
14726IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14727{
14728 RTUINT32U uSrc1 = { uSrc };
14729 puDst->au64[0] = uSrc1.au16[0];
14730 puDst->au64[1] = uSrc1.au16[1];
14731}
14732
14733
14734IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14735{
14736 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14737 puDst->au64[0] = uSrc1.au16[0];
14738 puDst->au64[1] = uSrc1.au16[1];
14739 puDst->au64[2] = uSrc1.au16[2];
14740 puDst->au64[3] = uSrc1.au16[3];
14741}
14742
14743
14744/*
14745 * PMOVZXDQ / VPMOVZXDQ
14746 */
14747IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14748{
14749 RTUINT64U uSrc1 = { uSrc };
14750 puDst->au64[0] = uSrc1.au32[0];
14751 puDst->au64[1] = uSrc1.au32[1];
14752}
14753
14754
14755IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14756{
14757 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14758 puDst->au64[0] = uSrc1.au32[0];
14759 puDst->au64[1] = uSrc1.au32[1];
14760 puDst->au64[2] = uSrc1.au32[2];
14761 puDst->au64[3] = uSrc1.au32[3];
14762}
14763
14764/**
14765 * Converts from the packed IPRT 32-bit (single precision) floating point format to
14766 * the SoftFloat 32-bit floating point format (float32_t).
14767 *
14768 * This is only a structure format conversion, nothing else.
14769 */
14770DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
14771{
14772 float32_t Tmp;
14773 Tmp.v = pr32Val->u;
14774 return Tmp;
14775}
14776
14777
14778/**
14779 * Converts from SoftFloat 32-bit floating point format (float32_t)
14780 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
14781 *
14782 * This is only a structure format conversion, nothing else.
14783 */
14784DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
14785{
14786 pr32Dst->u = r32XSrc.v;
14787 return pr32Dst;
14788}
14789
14790
14791/**
14792 * Converts from the packed IPRT 64-bit (single precision) floating point format to
14793 * the SoftFloat 64-bit floating point format (float64_t).
14794 *
14795 * This is only a structure format conversion, nothing else.
14796 */
14797DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
14798{
14799 float64_t Tmp;
14800 Tmp.v = pr64Val->u;
14801 return Tmp;
14802}
14803
14804
14805/**
14806 * Converts from SoftFloat 64-bit floating point format (float64_t)
14807 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
14808 *
14809 * This is only a structure format conversion, nothing else.
14810 */
14811DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
14812{
14813 pr64Dst->u = r64XSrc.v;
14814 return pr64Dst;
14815}
14816
14817
14818/** Initializer for the SoftFloat state structure. */
14819# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
14820 { \
14821 softfloat_tininess_afterRounding, \
14822 ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
14823 : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP ? (uint8_t)softfloat_round_max \
14824 : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN ? (uint8_t)softfloat_round_min \
14825 : (uint8_t)softfloat_round_minMag, \
14826 0, \
14827 (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), /* Matches X86_FSW_?E */\
14828 32 /* Rounding precision, not relevant for SIMD. */ \
14829 }
14830
14831#ifdef IEM_WITHOUT_ASSEMBLY
14832
14833/**
14834 * Helper for transfering exception to MXCSR and setting the result value
14835 * accordingly.
14836 *
14837 * @returns Updated MXCSR.
14838 * @param pSoftState The SoftFloat state following the operation.
14839 * @param r32Result The result of the SoftFloat operation.
14840 * @param pr32Result Where to store the result for IEM.
14841 * @param fMxcsr The original MXCSR value.
14842 */
14843DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
14844 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14845{
14846 iemFpSoftF32ToIprt(pr32Result, r32Result);
14847
14848 uint8_t fXcpt = pSoftState->exceptionFlags;
14849 if ( (fMxcsr & X86_MXCSR_FZ)
14850 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
14851 {
14852 /* Underflow masked and flush to zero is set. */
14853 pr32Result->s.uFraction = 0;
14854 pr32Result->s.uExponent = 0;
14855 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14856 }
14857
14858 /* If DAZ is set \#DE is never set. */
14859 if ( fMxcsr & X86_MXCSR_DAZ
14860 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14861 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14862 fXcpt &= ~X86_MXCSR_DE;
14863
14864 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14865}
14866
14867
14868/**
14869 * Helper for transfering exception to MXCSR and setting the result value
14870 * accordingly - ignores Flush-to-Zero.
14871 *
14872 * @returns Updated MXCSR.
14873 * @param pSoftState The SoftFloat state following the operation.
14874 * @param r32Result The result of the SoftFloat operation.
14875 * @param pr32Result Where to store the result for IEM.
14876 * @param fMxcsr The original MXCSR value.
14877 */
14878DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
14879 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14880{
14881 iemFpSoftF32ToIprt(pr32Result, r32Result);
14882
14883 uint8_t fXcpt = pSoftState->exceptionFlags;
14884 /* If DAZ is set \#DE is never set. */
14885 if ( fMxcsr & X86_MXCSR_DAZ
14886 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14887 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14888 fXcpt &= ~X86_MXCSR_DE;
14889
14890 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14891}
14892
14893
14894/**
14895 * Helper for transfering exception to MXCSR and setting the result value
14896 * accordingly.
14897 *
14898 * @returns Updated MXCSR.
14899 * @param pSoftState The SoftFloat state following the operation.
14900 * @param r64Result The result of the SoftFloat operation.
14901 * @param pr64Result Where to store the result for IEM.
14902 * @param fMxcsr The original MXCSR value.
14903 */
14904DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
14905 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14906{
14907 iemFpSoftF64ToIprt(pr64Result, r64Result);
14908 uint8_t fXcpt = pSoftState->exceptionFlags;
14909 if ( (fMxcsr & X86_MXCSR_FZ)
14910 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
14911 {
14912 /* Underflow masked and flush to zero is set. */
14913 iemFpSoftF64ToIprt(pr64Result, r64Result);
14914 pr64Result->s.uFractionHigh = 0;
14915 pr64Result->s.uFractionLow = 0;
14916 pr64Result->s.uExponent = 0;
14917 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14918 }
14919
14920 /* If DAZ is set \#DE is never set. */
14921 if ( fMxcsr & X86_MXCSR_DAZ
14922 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14923 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14924 fXcpt &= ~X86_MXCSR_DE;
14925
14926 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14927}
14928
14929
14930/**
14931 * Helper for transfering exception to MXCSR and setting the result value
14932 * accordingly - ignores Flush-to-Zero.
14933 *
14934 * @returns Updated MXCSR.
14935 * @param pSoftState The SoftFloat state following the operation.
14936 * @param r64Result The result of the SoftFloat operation.
14937 * @param pr64Result Where to store the result for IEM.
14938 * @param fMxcsr The original MXCSR value.
14939 */
14940DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
14941 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14942{
14943 iemFpSoftF64ToIprt(pr64Result, r64Result);
14944
14945 uint8_t fXcpt = pSoftState->exceptionFlags;
14946 /* If DAZ is set \#DE is never set. */
14947 if ( fMxcsr & X86_MXCSR_DAZ
14948 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14949 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14950 fXcpt &= ~X86_MXCSR_DE;
14951
14952 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14953}
14954
14955#endif /* IEM_WITHOUT_ASSEMBLY */
14956
14957
14958/**
14959 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
14960 * in MXCSR into account.
14961 *
14962 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14963 * @param pr32Val Where to store the result.
14964 * @param fMxcsr The input MXCSR value.
14965 * @param pr32Src The value to use.
14966 */
14967DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
14968{
14969 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
14970 {
14971 if (fMxcsr & X86_MXCSR_DAZ)
14972 {
14973 /* De-normals are changed to 0. */
14974 pr32Val->s.fSign = pr32Src->s.fSign;
14975 pr32Val->s.uFraction = 0;
14976 pr32Val->s.uExponent = 0;
14977 return 0;
14978 }
14979
14980 *pr32Val = *pr32Src;
14981 return X86_MXCSR_DE;
14982 }
14983
14984 *pr32Val = *pr32Src;
14985 return 0;
14986}
14987
14988
14989/**
14990 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
14991 * in MXCSR into account.
14992 *
14993 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14994 * @param pr64Val Where to store the result.
14995 * @param fMxcsr The input MXCSR value.
14996 * @param pr64Src The value to use.
14997 */
14998DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
14999{
15000 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
15001 {
15002 if (fMxcsr & X86_MXCSR_DAZ)
15003 {
15004 /* De-normals are changed to 0. */
15005 pr64Val->s64.fSign = pr64Src->s.fSign;
15006 pr64Val->s64.uFraction = 0;
15007 pr64Val->s64.uExponent = 0;
15008 return 0;
15009 }
15010
15011 *pr64Val = *pr64Src;
15012 return X86_MXCSR_DE;
15013 }
15014
15015 *pr64Val = *pr64Src;
15016 return 0;
15017}
15018
15019#ifdef IEM_WITHOUT_ASSEMBLY
15020
15021/**
15022 * Validates the given input operands returning whether the operation can continue or whether one
15023 * of the source operands contains a NaN value, setting the output accordingly.
15024 *
15025 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15026 * @param pr32Res Where to store the result in case the operation can't continue.
15027 * @param pr32Val1 The first input operand.
15028 * @param pr32Val2 The second input operand.
15029 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15030 */
15031DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
15032{
15033 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
15034 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
15035 if (cSNan + cQNan == 2)
15036 {
15037 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15038 *pr32Res = *pr32Val1;
15039 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15040 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15041 return true;
15042 }
15043 if (cSNan)
15044 {
15045 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15046 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15047 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15048 *pfMxcsr |= X86_MXCSR_IE;
15049 return true;
15050 }
15051 if (cQNan)
15052 {
15053 /* The QNan operand is placed into the result. */
15054 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15055 return true;
15056 }
15057
15058 Assert(!cQNan && !cSNan);
15059 return false;
15060}
15061
15062
15063/**
15064 * Validates the given double precision input operands returning whether the operation can continue or whether one
15065 * of the source operands contains a NaN value, setting the output accordingly.
15066 *
15067 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15068 * @param pr64Res Where to store the result in case the operation can't continue.
15069 * @param pr64Val1 The first input operand.
15070 * @param pr64Val2 The second input operand.
15071 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15072 */
15073DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
15074{
15075 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
15076 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
15077 if (cSNan + cQNan == 2)
15078 {
15079 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15080 *pr64Res = *pr64Val1;
15081 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15082 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15083 return true;
15084 }
15085 if (cSNan)
15086 {
15087 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15088 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15089 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15090 *pfMxcsr |= X86_MXCSR_IE;
15091 return true;
15092 }
15093 if (cQNan)
15094 {
15095 /* The QNan operand is placed into the result. */
15096 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15097 return true;
15098 }
15099
15100 Assert(!cQNan && !cSNan);
15101 return false;
15102}
15103
15104
15105/**
15106 * Validates the given single input operand returning whether the operation can continue or whether
15107 * contains a NaN value, setting the output accordingly.
15108 *
15109 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15110 * @param pr32Res Where to store the result in case the operation can't continue.
15111 * @param pr32Val The input operand.
15112 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15113 */
15114DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
15115{
15116 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
15117 {
15118 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15119 *pr32Res = *pr32Val;
15120 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15121 *pfMxcsr |= X86_MXCSR_IE;
15122 return true;
15123 }
15124 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
15125 {
15126 /* The QNan operand is placed into the result. */
15127 *pr32Res = *pr32Val;
15128 return true;
15129 }
15130
15131 return false;
15132}
15133
15134
15135/**
15136 * Validates the given double input operand returning whether the operation can continue or whether
15137 * contains a NaN value, setting the output accordingly.
15138 *
15139 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15140 * @param pr64Res Where to store the result in case the operation can't continue.
15141 * @param pr64Val The input operand.
15142 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15143 */
15144DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
15145{
15146 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
15147 {
15148 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15149 *pr64Res = *pr64Val;
15150 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15151 *pfMxcsr |= X86_MXCSR_IE;
15152 return true;
15153 }
15154 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
15155 {
15156 /* The QNan operand is placed into the result. */
15157 *pr64Res = *pr64Val;
15158 return true;
15159 }
15160
15161 return false;
15162}
15163
15164#endif /* IEM_WITHOUT_ASSEMBLY */
15165
15166/**
15167 * ADDPS
15168 */
15169#ifdef IEM_WITHOUT_ASSEMBLY
15170static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15171{
15172 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15173 return fMxcsr;
15174
15175 RTFLOAT32U r32Src1, r32Src2;
15176 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15177 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15178 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15179 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15180 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15181}
15182
15183
15184IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15185{
15186 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15187 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15188 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15189 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15190}
15191#endif
15192
15193
15194/**
15195 * ADDSS
15196 */
15197#ifdef IEM_WITHOUT_ASSEMBLY
15198IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15199{
15200 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15201 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15202 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15203 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15204}
15205#endif
15206
15207
15208/**
15209 * ADDPD
15210 */
15211#ifdef IEM_WITHOUT_ASSEMBLY
15212static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15213{
15214 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15215 return fMxcsr;
15216
15217 RTFLOAT64U r64Src1, r64Src2;
15218 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15219 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15220 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15221 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15222 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15223}
15224
15225
15226IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15227{
15228 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15229 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15230}
15231#endif
15232
15233
15234/**
15235 * ADDSD
15236 */
15237#ifdef IEM_WITHOUT_ASSEMBLY
15238IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15239{
15240 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15241 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15242}
15243#endif
15244
15245
15246/**
15247 * MULPS
15248 */
15249#ifdef IEM_WITHOUT_ASSEMBLY
15250static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15251{
15252 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15253 return fMxcsr;
15254
15255 RTFLOAT32U r32Src1, r32Src2;
15256 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15257 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15258 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15259 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15260 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15261}
15262
15263
15264IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15265{
15266 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15267 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15268 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15269 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15270}
15271#endif
15272
15273
15274/**
15275 * MULSS
15276 */
15277#ifdef IEM_WITHOUT_ASSEMBLY
15278IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15279{
15280 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15281 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15282 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15283 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15284}
15285#endif
15286
15287
15288/**
15289 * MULPD
15290 */
15291#ifdef IEM_WITHOUT_ASSEMBLY
15292static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15293{
15294 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15295 return fMxcsr;
15296
15297 RTFLOAT64U r64Src1, r64Src2;
15298 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15299 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15300 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15301 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15302 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15303}
15304
15305
15306IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15307{
15308 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15309 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15310}
15311#endif
15312
15313
15314/**
15315 * MULSD
15316 */
15317#ifdef IEM_WITHOUT_ASSEMBLY
15318IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15319{
15320 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15321 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15322}
15323#endif
15324
15325
15326/**
15327 * SUBPS
15328 */
15329#ifdef IEM_WITHOUT_ASSEMBLY
15330static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15331{
15332 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15333 return fMxcsr;
15334
15335 RTFLOAT32U r32Src1, r32Src2;
15336 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15337 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15338 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15339 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15340 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15341}
15342
15343
15344IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15345{
15346 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15347 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15348 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15349 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15350}
15351#endif
15352
15353
15354/**
15355 * SUBSS
15356 */
15357#ifdef IEM_WITHOUT_ASSEMBLY
15358IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15359{
15360 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15361 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15362 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15363 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15364}
15365#endif
15366
15367
15368/**
15369 * SUBPD
15370 */
15371#ifdef IEM_WITHOUT_ASSEMBLY
15372static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15373{
15374 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15375 return fMxcsr;
15376
15377 RTFLOAT64U r64Src1, r64Src2;
15378 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15379 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15380 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15381 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15382 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15383}
15384
15385
15386IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15387{
15388 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15389 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15390}
15391#endif
15392
15393
15394/**
15395 * SUBSD
15396 */
15397#ifdef IEM_WITHOUT_ASSEMBLY
15398IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15399{
15400 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15401 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15402}
15403#endif
15404
15405
15406/**
15407 * MINPS
15408 */
15409#ifdef IEM_WITHOUT_ASSEMBLY
15410static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15411{
15412 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15413 {
15414 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15415 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15416 return fMxcsr | X86_MXCSR_IE;
15417 }
15418
15419 RTFLOAT32U r32Src1, r32Src2;
15420 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15421 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15422 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15423 {
15424 *pr32Res = r32Src2;
15425 return fMxcsr;
15426 }
15427
15428 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15429 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15430 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15431 fLe
15432 ? iemFpSoftF32FromIprt(&r32Src1)
15433 : iemFpSoftF32FromIprt(&r32Src2),
15434 pr32Res, fMxcsr);
15435}
15436
15437
15438IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15439{
15440 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15441 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15442 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15443 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15444}
15445#endif
15446
15447
15448/**
15449 * MINSS
15450 */
15451#ifdef IEM_WITHOUT_ASSEMBLY
15452IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15453{
15454 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15455 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15456 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15457 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15458}
15459#endif
15460
15461
15462/**
15463 * MINPD
15464 */
15465#ifdef IEM_WITHOUT_ASSEMBLY
15466static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15467{
15468 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15469 {
15470 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15471 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15472 return fMxcsr | X86_MXCSR_IE;
15473 }
15474
15475 RTFLOAT64U r64Src1, r64Src2;
15476 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15477 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15478 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15479 {
15480 *pr64Res = r64Src2;
15481 return fMxcsr;
15482 }
15483
15484 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15485 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15486 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15487 fLe
15488 ? iemFpSoftF64FromIprt(&r64Src1)
15489 : iemFpSoftF64FromIprt(&r64Src2),
15490 pr64Res, fMxcsr);
15491}
15492
15493
15494IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15495{
15496 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15497 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15498}
15499#endif
15500
15501
15502/**
15503 * MINSD
15504 */
15505#ifdef IEM_WITHOUT_ASSEMBLY
15506IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15507{
15508 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15509 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15510}
15511#endif
15512
15513
15514/**
15515 * DIVPS
15516 */
15517#ifdef IEM_WITHOUT_ASSEMBLY
15518static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15519{
15520 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15521 return fMxcsr;
15522
15523 RTFLOAT32U r32Src1, r32Src2;
15524 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15525 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15526 if (RTFLOAT32U_IS_ZERO(&r32Src2))
15527 {
15528 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
15529 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
15530 {
15531 *pr32Res = g_ar32QNaN[1];
15532 return fMxcsr | X86_MXCSR_IE;
15533 }
15534 else if (RTFLOAT32U_IS_INF(&r32Src1))
15535 {
15536 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15537 return fMxcsr;
15538 }
15539 else
15540 {
15541 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15542 return fMxcsr | X86_MXCSR_ZE;
15543 }
15544 }
15545
15546 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15547 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15548 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15549}
15550
15551
15552IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15553{
15554 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15555 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15556 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15557 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15558}
15559#endif
15560
15561
15562/**
15563 * DIVSS
15564 */
15565#ifdef IEM_WITHOUT_ASSEMBLY
15566IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15567{
15568 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15569 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15570 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15571 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15572}
15573#endif
15574
15575
15576/**
15577 * DIVPD
15578 */
15579#ifdef IEM_WITHOUT_ASSEMBLY
15580static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15581{
15582 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15583 return fMxcsr;
15584
15585 RTFLOAT64U r64Src1, r64Src2;
15586 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15587 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15588 if (RTFLOAT64U_IS_ZERO(&r64Src2))
15589 {
15590 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
15591 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
15592 {
15593 *pr64Res = g_ar64QNaN[1];
15594 return fMxcsr | X86_MXCSR_IE;
15595 }
15596 else if (RTFLOAT64U_IS_INF(&r64Src1))
15597 {
15598 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15599 return fMxcsr;
15600 }
15601 else
15602 {
15603 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15604 return fMxcsr | X86_MXCSR_ZE;
15605 }
15606 }
15607
15608 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15609 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15610 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15611}
15612
15613
15614IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15615{
15616 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15617 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15618}
15619#endif
15620
15621
15622/**
15623 * DIVSD
15624 */
15625#ifdef IEM_WITHOUT_ASSEMBLY
15626IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15627{
15628 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15629 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15630}
15631#endif
15632
15633
15634/**
15635 * MAXPS
15636 */
15637#ifdef IEM_WITHOUT_ASSEMBLY
15638static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15639{
15640 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15641 {
15642 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15643 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15644 return fMxcsr | X86_MXCSR_IE;
15645 }
15646
15647 RTFLOAT32U r32Src1, r32Src2;
15648 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15649 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15650 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15651 {
15652 *pr32Res = r32Src2;
15653 return fMxcsr;
15654 }
15655
15656 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15657 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15658 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15659 fLe
15660 ? iemFpSoftF32FromIprt(&r32Src2)
15661 : iemFpSoftF32FromIprt(&r32Src1),
15662 pr32Res, fMxcsr);
15663}
15664
15665
15666IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15667{
15668 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15669 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15670 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15671 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15672}
15673#endif
15674
15675
15676/**
15677 * MAXSS
15678 */
15679#ifdef IEM_WITHOUT_ASSEMBLY
15680IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15681{
15682 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15683 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15684 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15685 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15686}
15687#endif
15688
15689
15690/**
15691 * MAXPD
15692 */
15693#ifdef IEM_WITHOUT_ASSEMBLY
15694static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15695{
15696 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15697 {
15698 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15699 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15700 return fMxcsr | X86_MXCSR_IE;
15701 }
15702
15703 RTFLOAT64U r64Src1, r64Src2;
15704 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15705 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15706 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15707 {
15708 *pr64Res = r64Src2;
15709 return fMxcsr;
15710 }
15711
15712 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15713 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15714 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15715 fLe
15716 ? iemFpSoftF64FromIprt(&r64Src2)
15717 : iemFpSoftF64FromIprt(&r64Src1),
15718 pr64Res, fMxcsr);
15719}
15720
15721
15722IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15723{
15724 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15725 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15726}
15727#endif
15728
15729
15730/**
15731 * MAXSD
15732 */
15733#ifdef IEM_WITHOUT_ASSEMBLY
15734IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15735{
15736 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15737 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15738}
15739#endif
15740
15741
15742/**
15743 * CVTSS2SD
15744 */
15745#ifdef IEM_WITHOUT_ASSEMBLY
15746static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15747{
15748 RTFLOAT32U r32Src1;
15749 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15750
15751 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15752 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15753 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15754}
15755
15756
15757IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15758{
15759 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
15760 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15761}
15762#endif
15763
15764
15765/**
15766 * CVTSD2SS
15767 */
15768#ifdef IEM_WITHOUT_ASSEMBLY
15769static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15770{
15771 RTFLOAT64U r64Src1;
15772 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15773
15774 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15775 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15776 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15777}
15778
15779
15780IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15781{
15782 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
15783 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15784 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15785 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15786}
15787#endif
15788
15789
15790/**
15791 * HADDPS
15792 */
15793#ifdef IEM_WITHOUT_ASSEMBLY
15794IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15795{
15796 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15797 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15798 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15799 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15800}
15801#endif
15802
15803
15804/**
15805 * HADDPD
15806 */
15807#ifdef IEM_WITHOUT_ASSEMBLY
15808IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15809{
15810 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15811 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15812}
15813#endif
15814
15815
15816/**
15817 * HSUBPS
15818 */
15819#ifdef IEM_WITHOUT_ASSEMBLY
15820IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15821{
15822 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15823 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15824 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15825 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15826}
15827#endif
15828
15829
15830/**
15831 * HSUBPD
15832 */
15833#ifdef IEM_WITHOUT_ASSEMBLY
15834IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15835{
15836 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15837 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15838}
15839#endif
15840
15841
15842/**
15843 * SQRTPS
15844 */
15845#ifdef IEM_WITHOUT_ASSEMBLY
15846static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15847{
15848 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15849 return fMxcsr;
15850
15851 RTFLOAT32U r32Src;
15852 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
15853 if (RTFLOAT32U_IS_ZERO(&r32Src))
15854 {
15855 *pr32Res = r32Src;
15856 return fMxcsr;
15857 }
15858 else if (r32Src.s.fSign)
15859 {
15860 *pr32Res = g_ar32QNaN[1];
15861 return fMxcsr | X86_MXCSR_IE;
15862 }
15863
15864 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15865 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15866 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15867}
15868
15869
15870IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15871{
15872 RT_NOREF(puSrc1);
15873
15874 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15875 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15876 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15877 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15878}
15879#endif
15880
15881
15882/**
15883 * SQRTSS
15884 */
15885#ifdef IEM_WITHOUT_ASSEMBLY
15886IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15887{
15888 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15889 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15890 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15891 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15892}
15893#endif
15894
15895
15896/**
15897 * SQRTPD
15898 */
15899#ifdef IEM_WITHOUT_ASSEMBLY
15900static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
15901{
15902 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
15903 return fMxcsr;
15904
15905 RTFLOAT64U r64Src;
15906 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
15907 if (RTFLOAT64U_IS_ZERO(&r64Src))
15908 {
15909 *pr64Res = r64Src;
15910 return fMxcsr;
15911 }
15912 else if (r64Src.s.fSign)
15913 {
15914 *pr64Res = g_ar64QNaN[1];
15915 return fMxcsr | X86_MXCSR_IE;
15916 }
15917
15918 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15919 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
15920 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15921}
15922
15923
15924IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15925{
15926 RT_NOREF(puSrc1);
15927
15928 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15929 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15930}
15931#endif
15932
15933
15934/**
15935 * SQRTSD
15936 */
15937#ifdef IEM_WITHOUT_ASSEMBLY
15938IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15939{
15940 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
15941 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15942}
15943#endif
15944
15945
15946#ifdef IEM_WITHOUT_ASSEMBLY
15947/**
15948 * RSQRTPS
15949 */
15950static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15951{
15952 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15953 return fMxcsr;
15954
15955 RTFLOAT32U r32Src;
15956 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
15957 if (RTFLOAT32U_IS_ZERO(&r32Src))
15958 {
15959 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
15960 return fMxcsr;
15961 }
15962 else if (r32Src.s.fSign)
15963 {
15964 *pr32Res = g_ar32QNaN[1];
15965 return fMxcsr | X86_MXCSR_IE;
15966 }
15967
15968 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15969 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15970 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15971}
15972
15973
15974IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15975{
15976 RT_NOREF(puSrc1);
15977
15978 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15979 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15980 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15981 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15982}
15983
15984
15985/**
15986 * RSQRTSS
15987 */
15988IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15989{
15990 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15991 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15992 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15993 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15994}
15995#endif
15996
15997
15998/**
15999 * ADDSUBPS
16000 */
16001#ifdef IEM_WITHOUT_ASSEMBLY
16002IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16003{
16004 RT_NOREF(puSrc1);
16005
16006 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
16007 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
16008 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
16009 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16010}
16011#endif
16012
16013
16014/**
16015 * ADDSUBPD
16016 */
16017#ifdef IEM_WITHOUT_ASSEMBLY
16018IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16019{
16020 RT_NOREF(puSrc1);
16021
16022 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
16023 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16024}
16025#endif
16026
16027
16028/**
16029 * CVTPD2PS
16030 */
16031#ifdef IEM_WITHOUT_ASSEMBLY
16032static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16033{
16034 RTFLOAT64U r64Src1;
16035 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16036
16037 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16038 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16039 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16040}
16041
16042
16043IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16044{
16045 RT_NOREF(puSrc1);
16046
16047 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16048 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16049 pResult->uResult.au32[2] = 0;
16050 pResult->uResult.au32[3] = 0;
16051}
16052#endif
16053
16054
16055/**
16056 * CVTPS2PD
16057 */
16058#ifdef IEM_WITHOUT_ASSEMBLY
16059static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16060{
16061 RTFLOAT32U r32Src1;
16062 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16063
16064 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16065 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16066 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16067}
16068
16069
16070IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16071{
16072 RT_NOREF(puSrc1);
16073
16074 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16075 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16076}
16077#endif
16078
16079
16080/**
16081 * CVTDQ2PS
16082 */
16083#ifdef IEM_WITHOUT_ASSEMBLY
16084static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
16085{
16086 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16087 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
16088 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16089}
16090
16091
16092IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16093{
16094 RT_NOREF(puSrc1);
16095
16096 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16097 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16098 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
16099 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
16100}
16101#endif
16102
16103
16104/**
16105 * CVTPS2DQ
16106 */
16107#ifdef IEM_WITHOUT_ASSEMBLY
16108static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16109{
16110 RTFLOAT32U r32Src;
16111 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16112
16113 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16114 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16115 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16116}
16117
16118
16119IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16120{
16121 RT_NOREF(puSrc1);
16122
16123 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16124 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16125 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16126 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16127}
16128#endif
16129
16130
16131/**
16132 * CVTTPS2DQ
16133 */
16134#ifdef IEM_WITHOUT_ASSEMBLY
16135static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16136{
16137 RTFLOAT32U r32Src;
16138 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16139
16140 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16141 SoftState.roundingMode = softfloat_round_minMag;
16142 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16143 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16144}
16145
16146
16147IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16148{
16149 RT_NOREF(puSrc1);
16150
16151 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16152 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16153 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16154 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16155}
16156#endif
16157
16158
16159/**
16160 * CVTTPD2DQ
16161 */
16162#ifdef IEM_WITHOUT_ASSEMBLY
16163static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16164{
16165 RTFLOAT64U r64Src;
16166 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16167
16168 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16169 SoftState.roundingMode = softfloat_round_minMag;
16170 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16171 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16172}
16173
16174
16175IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16176{
16177 RT_NOREF(puSrc1);
16178
16179 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16180 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16181 pResult->uResult.au64[1] = 0;
16182}
16183#endif
16184
16185
16186/**
16187 * CVTDQ2PD
16188 */
16189#ifdef IEM_WITHOUT_ASSEMBLY
16190static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
16191{
16192 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16193 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
16194 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16195}
16196
16197
16198IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16199{
16200 RT_NOREF(puSrc1);
16201
16202 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16203 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16204}
16205#endif
16206
16207
16208/**
16209 * CVTPD2DQ
16210 */
16211#ifdef IEM_WITHOUT_ASSEMBLY
16212static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16213{
16214 RTFLOAT64U r64Src;
16215 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16216
16217 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16218 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16219 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16220}
16221
16222
16223IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16224{
16225 RT_NOREF(puSrc1);
16226
16227 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16228 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16229 pResult->uResult.au64[1] = 0;
16230}
16231#endif
16232
16233
16234/**
16235 * [V]SHUFPS
16236 */
16237#ifdef IEM_WITHOUT_ASSEMBLY
16238IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16239{
16240 RTUINT128U const uSrc1 = *puDst;
16241 RTUINT128U const uSrc2 = *puSrc;
16242 ASMCompilerBarrier();
16243 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16244 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16245 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16246 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16247}
16248#endif
16249
16250
16251IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16252{
16253 RTUINT128U const uSrc1 = *puSrc1;
16254 RTUINT128U const uSrc2 = *puSrc2;
16255 ASMCompilerBarrier();
16256 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16257 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16258 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16259 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16260}
16261
16262
16263IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16264{
16265 RTUINT256U const uSrc1 = *puSrc1;
16266 RTUINT256U const uSrc2 = *puSrc2;
16267 ASMCompilerBarrier();
16268 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16269 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16270 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16271 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16272
16273 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
16274 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
16275 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
16276 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
16277}
16278
16279
16280/**
16281 * [V]SHUFPD
16282 */
16283#ifdef IEM_WITHOUT_ASSEMBLY
16284IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16285{
16286 RTUINT128U const uSrc1 = *puDst;
16287 RTUINT128U const uSrc2 = *puSrc;
16288 ASMCompilerBarrier();
16289 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16290 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16291}
16292#endif
16293
16294
16295IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16296{
16297 RTUINT128U const uSrc1 = *puSrc1;
16298 RTUINT128U const uSrc2 = *puSrc2;
16299 ASMCompilerBarrier();
16300 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16301 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16302}
16303
16304
16305IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16306{
16307 RTUINT256U const uSrc1 = *puSrc1;
16308 RTUINT256U const uSrc2 = *puSrc2;
16309 ASMCompilerBarrier();
16310 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16311 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16312 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
16313 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
16314}
16315
16316
16317/*
16318 * PHMINPOSUW / VPHMINPOSUW
16319 */
16320IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16321{
16322 uint16_t u16Min = puSrc->au16[0];
16323 uint8_t idxMin = 0;
16324
16325 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
16326 if (puSrc->au16[i] < u16Min)
16327 {
16328 u16Min = puSrc->au16[i];
16329 idxMin = i;
16330 }
16331
16332 puDst->au64[0] = 0;
16333 puDst->au64[1] = 0;
16334 puDst->au16[0] = u16Min;
16335 puDst->au16[1] = idxMin;
16336}
16337
16338
16339IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16340{
16341 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
16342}
16343
16344
16345/*
16346 * [V]PBLENDVB
16347 */
16348IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16349{
16350 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16351 if (puMask->au8[i] & RT_BIT(7))
16352 puDst->au8[i] = puSrc->au8[i];
16353}
16354
16355
16356IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16357{
16358 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16359 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
16360}
16361
16362
16363IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16364{
16365 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16366 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
16367}
16368
16369
16370/*
16371 * [V]BLENDVPS
16372 */
16373IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16374{
16375 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16376 if (puMask->au32[i] & RT_BIT_32(31))
16377 puDst->au32[i] = puSrc->au32[i];
16378}
16379
16380
16381IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16382{
16383 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16384 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
16385}
16386
16387
16388IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16389{
16390 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16391 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
16392}
16393
16394
16395/*
16396 * [V]BLENDVPD
16397 */
16398IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16399{
16400 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
16401 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
16402}
16403
16404
16405IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16406{
16407 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16408 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
16409}
16410
16411
16412IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16413{
16414 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16415 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
16416}
16417
16418
16419/**
16420 * [V]PALIGNR
16421 */
16422IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
16423{
16424 uint64_t const u64Src1 = *pu64Dst;
16425 ASMCompilerBarrier();
16426
16427 if (bEvil >= 16)
16428 *pu64Dst = 0;
16429 else if (bEvil >= 8)
16430 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
16431 else
16432 {
16433 uint8_t cShift = bEvil * 8;
16434 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
16435 | (u64Src2 >> cShift);
16436 }
16437}
16438
16439
16440IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16441{
16442 RTUINT128U const uSrc1 = *puDst;
16443 RTUINT128U const uSrc2 = *puSrc;
16444 ASMCompilerBarrier();
16445
16446 puDst->au64[0] = 0;
16447 puDst->au64[1] = 0;
16448 if (bEvil >= 32)
16449 { /* Everything stays 0. */ }
16450 else if (bEvil >= 16)
16451 {
16452 bEvil -= 16;
16453 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16454 puDst->au8[i - bEvil] = uSrc1.au8[i];
16455 }
16456 else
16457 {
16458 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16459 puDst->au8[i] = uSrc2.au8[i + bEvil];
16460 for (uint8_t i = 0; i < bEvil; i++)
16461 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16462 }
16463}
16464
16465
16466IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16467{
16468 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16469 RTUINT128U const uSrc2 = *puSrc2;
16470 ASMCompilerBarrier();
16471
16472 puDst->au64[0] = 0;
16473 puDst->au64[1] = 0;
16474 if (bEvil >= 32)
16475 { /* Everything stays 0. */ }
16476 else if (bEvil >= 16)
16477 {
16478 bEvil -= 16;
16479 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16480 puDst->au8[i - bEvil] = uSrc1.au8[i];
16481 }
16482 else
16483 {
16484 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16485 puDst->au8[i] = uSrc2.au8[i + bEvil];
16486 for (uint8_t i = 0; i < bEvil; i++)
16487 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16488 }
16489}
16490
16491
16492IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16493{
16494 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16495 RTUINT256U const uSrc2 = *puSrc2;
16496 ASMCompilerBarrier();
16497
16498 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
16499 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
16500}
16501
16502
16503/**
16504 * [V]PBLENDW
16505 */
16506IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16507{
16508 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16509 if (bEvil & RT_BIT(i))
16510 puDst->au16[i] = puSrc->au16[i];
16511}
16512
16513
16514IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16515{
16516 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16517 if (bEvil & RT_BIT(i))
16518 puDst->au16[i] = puSrc2->au16[i];
16519 else
16520 puDst->au16[i] = puSrc1->au16[i];
16521}
16522
16523
16524IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16525{
16526 for (uint8_t i = 0; i < 8; i++)
16527 if (bEvil & RT_BIT(i))
16528 {
16529 puDst->au16[ i] = puSrc2->au16[ i];
16530 puDst->au16[8 + i] = puSrc2->au16[8 + i];
16531 }
16532 else
16533 {
16534 puDst->au16[ i] = puSrc1->au16[ i];
16535 puDst->au16[8 + i] = puSrc1->au16[8 + i];
16536 }
16537}
16538
16539
16540/**
16541 * [V]BLENDPS
16542 */
16543IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16544{
16545 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16546 if (bEvil & RT_BIT(i))
16547 puDst->au32[i] = puSrc->au32[i];
16548}
16549
16550
16551IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16552{
16553 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16554 if (bEvil & RT_BIT(i))
16555 puDst->au32[i] = puSrc2->au32[i];
16556 else
16557 puDst->au32[i] = puSrc1->au32[i];
16558}
16559
16560
16561IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16562{
16563 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16564 if (bEvil & RT_BIT(i))
16565 puDst->au32[i] = puSrc2->au32[i];
16566 else
16567 puDst->au32[i] = puSrc1->au32[i];
16568}
16569
16570
16571/**
16572 * [V]BLENDPD
16573 */
16574IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16575{
16576 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16577 if (bEvil & RT_BIT(i))
16578 puDst->au64[i] = puSrc->au64[i];
16579}
16580
16581
16582IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16583{
16584 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16585 if (bEvil & RT_BIT(i))
16586 puDst->au64[i] = puSrc2->au64[i];
16587 else
16588 puDst->au64[i] = puSrc1->au64[i];
16589}
16590
16591
16592IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16593{
16594 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16595 if (bEvil & RT_BIT(i))
16596 puDst->au64[i] = puSrc2->au64[i];
16597 else
16598 puDst->au64[i] = puSrc1->au64[i];
16599}
16600
16601
16602/**
16603 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
16604 */
16605
16606static uint8_t iemAImpl_aes_sbox[] = {
16607 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
16608 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
16609 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
16610 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
16611 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
16612 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
16613 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
16614 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
16615 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
16616 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
16617 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
16618 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
16619 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
16620 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
16621 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
16622 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
16623};
16624
16625/* The InvS-Box lookup table. */
16626static uint8_t iemAImpl_aes_inv_sbox[] = {
16627 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
16628 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
16629 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
16630 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
16631 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
16632 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
16633 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
16634 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
16635 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
16636 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
16637 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
16638 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
16639 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
16640 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
16641 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
16642 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
16643};
16644
16645/* The ShiftRows lookup table. */
16646static uint8_t iemAImpl_aes_shift_rows_tbl[] = {
16647 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
16648};
16649
16650/* The InvShiftRows lookup table. */
16651static uint8_t iemAImpl_aes_inv_shift_rows_tbl[] = {
16652 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3
16653};
16654
16655static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
16656{
16657 RTUINT128U uVal;
16658 int i;
16659
16660 for (i = 0; i < 16; ++i)
16661 uVal.au8[i] = abSubst[puSrc->au8[i]];
16662
16663 return uVal;
16664}
16665
16666static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
16667{
16668 return (u << 1) ^ (((u >> 7) & 1) * 27);
16669}
16670
16671static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
16672{
16673 RTUINT128U uVal;
16674 int i;
16675 uint8_t tmp;
16676
16677 for (i = 0; i < 16; i += 4) {
16678 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
16679 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
16680 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
16681 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
16682 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
16683 }
16684
16685 return uVal;
16686}
16687
16688static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
16689{
16690 RTUINT128U uVal;
16691 int i;
16692
16693 for (i = 0; i < 16; ++i)
16694 uVal.au8[i] = puSrc->au8[abShift[i]];
16695
16696 return uVal;
16697}
16698
16699static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
16700{
16701 uint8_t val;
16702
16703 val = ((b >> 0) & 1) * a;
16704 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
16705 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
16706 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
16707 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
16708
16709 return val;
16710}
16711
16712static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
16713{
16714 RTUINT128U uVal;
16715 int i;
16716
16717 for (i = 0; i < 16; i += 4) {
16718 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
16719 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
16720 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
16721 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
16722 }
16723
16724 return uVal;
16725}
16726
16727static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
16728{
16729 RTUINT32U uTmp;
16730
16731 uTmp.au32[0] = w;
16732 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
16733 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
16734 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
16735 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
16736
16737 return uTmp.au32[0];
16738}
16739
16740static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
16741{
16742 return (w << 24) | (w >> 8);
16743}
16744
16745/**
16746 * [V]AESKEYGENASSIST
16747 */
16748IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
16749{
16750 RTUINT128U uTmp;
16751 uint32_t uRCon = bImm; /* Round constant. */
16752
16753 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
16754 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
16755 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
16756 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
16757
16758 *puDst = uTmp;
16759}
16760
16761
16762/**
16763 * [V]AESIMC
16764 */
16765IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16766{
16767 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
16768}
16769
16770
16771/**
16772 * [V]AESENC
16773 */
16774IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16775{
16776 RTUINT128U uTmp;
16777
16778 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16779 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16780 uTmp = iemAImpl_aes_mix_col(&uTmp);
16781 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16782 uTmp.au64[1] ^= puSrc->au64[1];
16783
16784 *puDst = uTmp;
16785}
16786
16787
16788/**
16789 * [V]AESENCLAST
16790 */
16791IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16792{
16793 RTUINT128U uTmp;
16794
16795 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16796 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16797 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16798 uTmp.au64[1] ^= puSrc->au64[1];
16799
16800 *puDst = uTmp;
16801}
16802
16803
16804/**
16805 * [V]AESDEC
16806 */
16807IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16808{
16809 RTUINT128U uTmp;
16810
16811 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16812 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16813 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
16814 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16815 uTmp.au64[1] ^= puSrc->au64[1];
16816
16817 *puDst = uTmp;
16818}
16819
16820
16821/**
16822 * [V]AESDECLAST
16823 */
16824IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16825{
16826 RTUINT128U uTmp;
16827
16828 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16829 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16830 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16831 uTmp.au64[1] ^= puSrc->au64[1];
16832
16833 *puDst = uTmp;
16834}
16835
16836
16837/**
16838 * [V]PCMPISTRI
16839 */
16840
16841/**
16842 * Does the comparisons based on the mode and source input format.
16843 */
16844static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
16845{
16846#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
16847 do \
16848 { \
16849 for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
16850 for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
16851 { \
16852 switch (a_bAggOp) \
16853 { \
16854 case 0: \
16855 case 2: \
16856 case 3: \
16857 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
16858 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
16859 break; \
16860 case 1: \
16861 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
16862 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
16863 break; \
16864 default: \
16865 AssertReleaseFailed(); \
16866 } \
16867 } \
16868 } while(0)
16869
16870 uint8_t bAggOp = (bImm >> 2) & 0x3;
16871 switch (bImm & 0x3)
16872 {
16873 case 0:
16874 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
16875 break;
16876 case 1:
16877 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
16878 break;
16879 case 2:
16880 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
16881 break;
16882 case 3:
16883 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
16884 break;
16885 default:
16886 AssertReleaseFailed();
16887 }
16888#undef PCMPXSTRX_CMP_CASE
16889}
16890
16891static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
16892{
16893 if (bImm & 0x1)
16894 {
16895 /* Words -> 8 elements. */
16896 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
16897 if (puSrc->au16[i] == 0)
16898 return i;
16899
16900 return 8;
16901 }
16902 else
16903 {
16904 /* Bytes -> 16 elements. */
16905 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
16906 if (puSrc->au8[i] == 0)
16907 return i;
16908
16909 return 16;
16910 }
16911}
16912
16913static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
16914{
16915 if (bImm & 0x1)
16916 {
16917 if (i64Len > -8 && i64Len < 8)
16918 return RT_ABS(i64Len);
16919
16920 return 8;
16921 }
16922 else
16923 {
16924 if (i64Len > -16 && i64Len < 16)
16925 return RT_ABS(i64Len);
16926
16927 return 16;
16928 }
16929}
16930
16931/**
16932 * Valid/Invalid override of comparisons (Table 4-7 from 4.1.6 of SDM).
16933 */
16934static const bool g_afCmpOverride[4][3] =
16935{
16936 /* xmm1 AND xmm2/m128 invalid xmm1 invalid, xmm2/m128 valid xmm1 valid, xmm2/m128 invalid */
16937 { false, false, false }, /* Imm8[3:2] = 00b (equal any) */
16938 { false, false, false }, /* Imm8[3:2] = 01b (ranges) */
16939 { true, false, false }, /* Imm8[3:2] = 10b (equal each) */
16940 { true, true, false }, /* Imm8[3:2] = 11b (equal ordered) */
16941};
16942
16943DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
16944{
16945 if (fSrc1Valid && fSrc2Valid)
16946 return fCmpRes;
16947
16948 uint8_t bSrc1Valid = fSrc1Valid ? 2 : 0;
16949 uint8_t bSrc2Valid = fSrc2Valid ? 1 : 0;
16950 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
16951}
16952
16953static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
16954{
16955 uint8_t bAggOp = (bImm >> 2) & 0x3;
16956 uint16_t u16Result = 0;
16957
16958 switch (bAggOp)
16959 {
16960 case 0: /* Equal any */
16961 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
16962 {
16963 uint16_t u16Res = 0;
16964 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1++)
16965 {
16966 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
16967 idxSrc1 < idxLen1,
16968 idxSrc2 < idxLen2,
16969 bAggOp))
16970 {
16971 u16Res = RT_BIT(idxSrc2);
16972 break;
16973 }
16974 }
16975
16976 u16Result |= u16Res;
16977 }
16978 break;
16979
16980 case 1: /* Ranges */
16981 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
16982 {
16983 uint16_t u16Res = 0;
16984 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1 += 2)
16985 {
16986 if ( iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
16987 idxSrc1 < idxLen1,
16988 idxSrc2 < idxLen2,
16989 bAggOp)
16990 && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1 + 1],
16991 (idxSrc1 + 1) < idxLen1,
16992 idxSrc2 < idxLen2,
16993 bAggOp))
16994 {
16995 u16Res = RT_BIT(idxSrc2);
16996 break;
16997 }
16998 }
16999
17000 u16Result |= u16Res;
17001 }
17002 break;
17003
17004 case 2: /* Equal each */
17005 for (uint8_t i = 0; i < cElems; i++)
17006 {
17007 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[i][i],
17008 i < idxLen1,
17009 i < idxLen2,
17010 bAggOp))
17011 u16Result |= RT_BIT(i);
17012 }
17013 break;
17014
17015 case 3: /* Equal ordered */
17016 u16Result = 0;
17017 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
17018 {
17019 uint16_t u16Res = RT_BIT(idxSrc2);
17020 for (uint8_t idxSrc1 = 0, k = idxSrc2; (idxSrc1 < (cElems - idxSrc2)) && (k < cElems); idxSrc1++, k++)
17021 {
17022 if (!iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[k][idxSrc1],
17023 idxSrc1 < idxLen1,
17024 k < idxLen2,
17025 bAggOp))
17026 {
17027 u16Res = 0;
17028 break;
17029 }
17030 }
17031
17032 u16Result |= u16Res;
17033 }
17034 break;
17035 }
17036
17037 /* Polarity selection. */
17038 switch ((bImm >> 4) & 0x3)
17039 {
17040 case 0:
17041 case 2:
17042 /* Nothing to do. */
17043 break;
17044 case 1:
17045 u16Result = (cElems == 8 ? 0xff : 0xffff) ^ u16Result;
17046 break;
17047 case 3:
17048 u16Result ^= RT_BIT(idxLen2) - 1;
17049 break;
17050 default:
17051 AssertReleaseFailed();
17052 }
17053
17054 return u16Result;
17055}
17056
17057DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
17058{
17059 uint32_t fEFlags = 0;
17060
17061 if (u16Result)
17062 fEFlags |= X86_EFL_CF;
17063 if (cLen2 < cElems)
17064 fEFlags |= X86_EFL_ZF;
17065 if (cLen1 < cElems)
17066 fEFlags |= X86_EFL_SF;
17067 if (u16Result & 0x1)
17068 fEFlags |= X86_EFL_OF;
17069 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
17070}
17071
17072DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
17073 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
17074{
17075 bool afCmpRes[16][16];
17076 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17077
17078 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
17079 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
17080 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
17081
17082 return u16Result;
17083}
17084
17085DECL_FORCE_INLINE(void) iemAImpl_pcmpxstri_set_result_index(uint32_t *pu32Ecx, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
17086{
17087 if (bImm & RT_BIT(6))
17088 {
17089 /* Index for MSB set. */
17090 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
17091 if (idxMsb)
17092 *pu32Ecx = idxMsb - 1;
17093 else
17094 *pu32Ecx = cElems;
17095 }
17096 else
17097 {
17098 /* Index for LSB set. */
17099 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
17100 if (idxLsb)
17101 *pu32Ecx = idxLsb - 1;
17102 else
17103 *pu32Ecx = cElems;
17104 }
17105}
17106
17107IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
17108{
17109 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17110 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
17111 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
17112
17113 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17114 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
17115}
17116
17117
17118/**
17119 * [V]PCMPESTRI
17120 */
17121IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
17122{
17123 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17124 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
17125 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
17126
17127 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17128 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
17129}
17130
17131
17132/**
17133 * [V]PCMPISTRM
17134 */
17135DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
17136{
17137 if (bImm & RT_BIT(6))
17138 {
17139 /* Generate a mask. */
17140 if (cElems == 8)
17141 {
17142 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17143 if (u16Result & RT_BIT(i))
17144 puDst->au16[i] = 0xffff;
17145 else
17146 puDst->au16[i] = 0;
17147 }
17148 else
17149 {
17150 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17151 if (u16Result & RT_BIT(i))
17152 puDst->au8[i] = 0xff;
17153 else
17154 puDst->au8[i] = 0;
17155 }
17156 }
17157 else
17158 {
17159 /* Store the result. */
17160 puDst->au64[0] = u16Result;
17161 puDst->au64[1] = 0;
17162 }
17163}
17164
17165IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
17166{
17167 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17168 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
17169 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
17170
17171 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17172 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
17173}
17174
17175
17176/**
17177 * [V]PCMPESTRM
17178 */
17179IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
17180{
17181 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17182 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
17183 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
17184
17185 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17186 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
17187}
17188
17189
17190/*
17191 * [V]PCLMULQDQ
17192 */
17193IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17194{
17195 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
17196}
17197
17198
17199IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17200{
17201 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
17202 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
17203
17204 puDst->au64[0] = 0;
17205 puDst->au64[1] = 0;
17206
17207 /*
17208 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
17209 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
17210 * and squeeze out some optimizations.
17211 */
17212 if (uSrc1 & 0x1)
17213 puDst->au64[0] = uSrc2;
17214
17215 uSrc1 >>= 1;
17216
17217 uint8_t iDigit = 1;
17218 while (uSrc1)
17219 {
17220 if (uSrc1 & 0x1)
17221 {
17222 puDst->au64[0] ^= (uSrc2 << iDigit);
17223 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
17224 }
17225
17226 uSrc1 >>= 1;
17227 iDigit++;
17228 }
17229}
17230
17231
17232/**
17233 * [V]PINSRW
17234 */
17235#ifdef IEM_WITHOUT_ASSEMBLY
17236IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u64,(uint64_t *pu64Dst, uint16_t u16Src, uint8_t bEvil))
17237{
17238 uint8_t cShift = (bEvil & 0x3) * 16;
17239 *pu64Dst = (*pu64Dst & ~(UINT64_C(0xffff) << cShift)) | ((uint64_t)u16Src << cShift);
17240}
17241
17242
17243IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u128,(PRTUINT128U puDst, uint16_t u16Src, uint8_t bEvil))
17244{
17245 puDst->au16[bEvil & 0x7] = u16Src;
17246}
17247#endif
17248
17249
17250IEM_DECL_IMPL_DEF(void, iemAImpl_vpinsrw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint16_t u16Src, uint8_t bEvil))
17251{
17252 *puDst = *puSrc;
17253 puDst->au16[bEvil & 0x7] = u16Src;
17254}
17255
17256
17257/**
17258 * [V]PEXTRW
17259 */
17260#ifdef IEM_WITHOUT_ASSEMBLY
17261IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u64,(uint16_t *pu16Dst, uint64_t u64Src, uint8_t bEvil))
17262{
17263 *pu16Dst = (uint16_t)(u64Src >> ((bEvil & 0x3) * 16));
17264}
17265
17266
17267IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u128,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
17268{
17269 *pu16Dst = puSrc->au16[bEvil & 0x7];
17270}
17271
17272#endif
17273
17274IEM_DECL_IMPL_DEF(void, iemAImpl_vpextrw_u128_fallback,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
17275{
17276 *pu16Dst = puSrc->au16[bEvil & 0x7];
17277}
17278
17279
17280/**
17281 * [V]MOVMSKPS
17282 */
17283#ifdef IEM_WITHOUT_ASSEMBLY
17284IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17285{
17286 *pu8Dst = puSrc->au32[0] >> 31;
17287 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17288 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17289 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17290}
17291
17292#endif
17293
17294IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17295{
17296 *pu8Dst = puSrc->au32[0] >> 31;
17297 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17298 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17299 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17300}
17301
17302
17303IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
17304{
17305 *pu8Dst = puSrc->au32[0] >> 31;
17306 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17307 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17308 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17309 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
17310 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
17311 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
17312 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
17313}
17314
17315
17316/**
17317 * [V]MOVMSKPD
17318 */
17319#ifdef IEM_WITHOUT_ASSEMBLY
17320IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17321{
17322 *pu8Dst = puSrc->au64[0] >> 63;
17323 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17324}
17325
17326#endif
17327
17328IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17329{
17330 *pu8Dst = puSrc->au64[0] >> 63;
17331 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17332}
17333
17334
17335IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
17336{
17337 *pu8Dst = puSrc->au64[0] >> 63;
17338 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17339 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
17340 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
17341}
17342
17343
17344/**
17345 * CVTTSD2SI
17346 */
17347#ifdef IEM_WITHOUT_ASSEMBLY
17348IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
17349{
17350 RTFLOAT64U r64Src;
17351
17352 r64Src.u = *pu64Src;
17353 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17354
17355 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17356 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17357 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17358}
17359
17360
17361IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
17362{
17363 RTFLOAT64U r64Src;
17364
17365 r64Src.u = *pu64Src;
17366 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17367
17368 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17369 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17370 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17371}
17372#endif
17373
17374
17375/**
17376 * CVTSD2SI
17377 */
17378#ifdef IEM_WITHOUT_ASSEMBLY
17379IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
17380{
17381 RTFLOAT64U r64Src;
17382
17383 r64Src.u = *pu64Src;
17384 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17385
17386 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17387 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17388 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17389}
17390
17391
17392IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
17393{
17394 RTFLOAT64U r64Src;
17395
17396 r64Src.u = *pu64Src;
17397 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17398
17399 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17400 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17401 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17402}
17403#endif
17404
17405
17406/**
17407 * CVTTSS2SI
17408 */
17409#ifdef IEM_WITHOUT_ASSEMBLY
17410IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
17411{
17412 RTFLOAT32U r32Src;
17413
17414 r32Src.u = *pu32Src;
17415 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17416
17417 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17418 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17419 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17420}
17421
17422
17423IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
17424{
17425 RTFLOAT32U r32Src;
17426
17427 r32Src.u = *pu32Src;
17428 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17429
17430 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17431 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17432 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17433}
17434#endif
17435
17436
17437/**
17438 * CVTSS2SI
17439 */
17440#ifdef IEM_WITHOUT_ASSEMBLY
17441IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
17442{
17443 RTFLOAT32U r32Src;
17444
17445 r32Src.u = *pu32Src;
17446 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17447
17448 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17449 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17450 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17451}
17452
17453
17454IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
17455{
17456 RTFLOAT32U r32Src;
17457
17458 r32Src.u = *pu32Src;
17459 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17460
17461 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17462 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17463 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17464}
17465#endif
17466
17467
17468/**
17469 * CVTSI2SD
17470 */
17471#ifdef IEM_WITHOUT_ASSEMBLY
17472IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
17473{
17474 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17475 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
17476 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
17477}
17478
17479
17480IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
17481{
17482 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17483 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
17484 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
17485}
17486#endif
17487
17488
17489/**
17490 * CVTSI2SS
17491 */
17492#ifdef IEM_WITHOUT_ASSEMBLY
17493IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
17494{
17495 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17496 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
17497 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
17498}
17499
17500
17501IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
17502{
17503 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17504 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
17505 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
17506}
17507#endif
17508
17509
17510/**
17511 * [V]UCOMISS
17512 */
17513#ifdef IEM_WITHOUT_ASSEMBLY
17514IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17515{
17516 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17517
17518 if (RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]))
17519 {
17520 *pfMxcsr |= X86_MXCSR_IE;
17521 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17522 }
17523 else if (RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
17524 {
17525 /* ucomiss doesn't raise \#IE for quiet NaNs. */
17526 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17527 }
17528 else
17529 {
17530 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17531
17532 RTFLOAT32U r32Src1, r32Src2;
17533 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
17534 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
17535
17536 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17537 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17538 if (f32_eq(f32Src1, f32Src2, &SoftState))
17539 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17540 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17541 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17542 /* else: GREATER_THAN 000 */
17543
17544 *pfMxcsr |= fDe;
17545 }
17546
17547 *pfEFlags = fEFlagsNew;
17548}
17549#endif
17550
17551IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17552{
17553 iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17554}
17555
17556
17557/**
17558 * [V]UCOMISD
17559 */
17560#ifdef IEM_WITHOUT_ASSEMBLY
17561IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17562{
17563 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17564
17565 if (RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]))
17566 {
17567 *pfMxcsr |= X86_MXCSR_IE;
17568 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17569 }
17570 else if (RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
17571 {
17572 /* ucomiss doesn't raise \#IE for quiet NaNs. */
17573 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17574 }
17575 else
17576 {
17577 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17578
17579 RTFLOAT64U r64Src1, r64Src2;
17580 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
17581 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
17582
17583 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17584 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17585 if (f64_eq(f64Src1, f64Src2, &SoftState))
17586 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17587 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17588 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17589 /* else: GREATER_THAN 000 */
17590
17591 *pfMxcsr |= fDe;
17592 }
17593
17594 *pfEFlags = fEFlagsNew;
17595}
17596#endif
17597
17598IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17599{
17600 iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17601}
17602
17603
17604/**
17605 * [V]COMISS
17606 */
17607#ifdef IEM_WITHOUT_ASSEMBLY
17608IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17609{
17610 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17611
17612 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
17613 || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
17614 {
17615 *pfMxcsr |= X86_MXCSR_IE;
17616 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17617 }
17618 else
17619 {
17620 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17621
17622 RTFLOAT32U r32Src1, r32Src2;
17623 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
17624 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
17625
17626 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17627 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17628 if (f32_eq(f32Src1, f32Src2, &SoftState))
17629 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17630 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17631 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17632 /* else: GREATER_THAN 000 */
17633
17634 *pfMxcsr |= fDe;
17635 }
17636
17637 *pfEFlags = fEFlagsNew;
17638}
17639#endif
17640
17641
17642IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17643{
17644 iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17645}
17646
17647
17648/**
17649 * [V]COMISD
17650 */
17651#ifdef IEM_WITHOUT_ASSEMBLY
17652IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17653{
17654 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17655
17656 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
17657 || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
17658 {
17659 *pfMxcsr |= X86_MXCSR_IE;
17660 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17661 }
17662 else
17663 {
17664 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17665
17666 RTFLOAT64U r64Src1, r64Src2;
17667 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
17668 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
17669
17670 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17671 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17672 if (f64_eq(f64Src1, f64Src2, &SoftState))
17673 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17674 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17675 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17676 /* else: GREATER_THAN 000 */
17677
17678 *pfMxcsr |= fDe;
17679 }
17680
17681 *pfEFlags = fEFlagsNew;
17682}
17683#endif
17684
17685IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17686{
17687 iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17688}
17689
17690
17691/**
17692 * CMPPS / CMPPD / CMPSS / CMPSD
17693 */
17694#ifdef IEM_WITHOUT_ASSEMBLY
17695/**
17696 * A compare truth table entry.
17697 */
17698typedef struct CMPTRUTHTBLENTRY
17699{
17700 /** Flag whether the \#IA is signalled when one of the source oeprans is a QNaN */
17701 bool fSignalsOnQNan;
17702 /** The boolean result when the input operands are unordered. */
17703 bool fUnordered;
17704 /** The boolean result when A = B. */
17705 bool fEqual;
17706 /** The boolean result when A < B. */
17707 bool fLowerThan;
17708 /** The boolean result when A > B. */
17709 bool fGreaterThan;
17710} CMPTRUTHTBLENTRY;
17711/** Pointer to a const truth table entry. */
17712typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
17713
17714
17715/** The compare truth table (indexed by immediate). */
17716static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
17717{
17718 /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
17719 /* 00H (EQ_OQ) */ { false, false, true, false, false },
17720 /* 01H (LT_OS) */ { true, false, false, true, false },
17721 /* 02H (LE_OS) */ { true, false, true, true, false },
17722 /* 03H (UNORD_Q) */ { false, true, false, false, false },
17723 /* 04H (NEQ_UQ) */ { false, true, false, true, true },
17724 /* 05H (NLT_US) */ { true, true, true, false, true },
17725 /* 06H (NLE_US) */ { true, true, false, false, true },
17726 /* 07H (ORQ_Q) */ { false, false, true, true, true },
17727 /** @todo AVX variants. */
17728};
17729
17730
17731static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
17732{
17733 bool fRes;
17734 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
17735
17736 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
17737 {
17738 *pfMxcsr |= X86_MXCSR_IE;
17739 fRes = g_aCmpTbl[bEvil].fUnordered;
17740 }
17741 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
17742 {
17743 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
17744 *pfMxcsr |= X86_MXCSR_IE;
17745 fRes = g_aCmpTbl[bEvil].fUnordered;
17746 }
17747 else
17748 {
17749 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17750
17751 RTFLOAT32U r32Src1, r32Src2;
17752 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
17753 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
17754
17755 *pfMxcsr |= fDe;
17756 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17757 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17758 if (f32_eq(f32Src1, f32Src2, &SoftState))
17759 fRes = g_aCmpTbl[bEvil].fEqual;
17760 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17761 fRes = g_aCmpTbl[bEvil].fLowerThan;
17762 else
17763 fRes = g_aCmpTbl[bEvil].fGreaterThan;
17764 }
17765
17766 return fRes;
17767}
17768
17769
17770static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
17771{
17772 bool fRes;
17773 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
17774
17775 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
17776 {
17777 *pfMxcsr |= X86_MXCSR_IE;
17778 fRes = g_aCmpTbl[bEvil].fUnordered;
17779 }
17780 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
17781 {
17782 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
17783 *pfMxcsr |= X86_MXCSR_IE;
17784 fRes = g_aCmpTbl[bEvil].fUnordered;
17785 }
17786 else
17787 {
17788 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17789
17790 RTFLOAT64U r64Src1, r64Src2;
17791 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
17792 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
17793
17794 *pfMxcsr |= fDe;
17795 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17796 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17797 if (f64_eq(f64Src1, f64Src2, &SoftState))
17798 fRes = g_aCmpTbl[bEvil].fEqual;
17799 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17800 fRes = g_aCmpTbl[bEvil].fLowerThan;
17801 else
17802 fRes = g_aCmpTbl[bEvil].fGreaterThan;
17803 }
17804
17805 return fRes;
17806}
17807
17808
17809IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17810{
17811 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
17812 {
17813 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
17814 puDst->au32[i] = UINT32_MAX;
17815 else
17816 puDst->au32[i] = 0;
17817 }
17818}
17819
17820
17821IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17822{
17823 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
17824 {
17825 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
17826 puDst->au64[i] = UINT64_MAX;
17827 else
17828 puDst->au64[i] = 0;
17829 }
17830}
17831
17832
17833IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17834{
17835 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
17836 puDst->au32[0] = UINT32_MAX;
17837 else
17838 puDst->au32[0] = 0;
17839
17840 puDst->au32[1] = pSrc->uSrc1.au32[1];
17841 puDst->au64[1] = pSrc->uSrc1.au64[1];
17842}
17843
17844
17845IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17846{
17847 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
17848 puDst->au64[0] = UINT64_MAX;
17849 else
17850 puDst->au64[0] = 0;
17851
17852 puDst->au64[1] = pSrc->uSrc1.au64[1];
17853}
17854#endif
17855
17856
17857/**
17858 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
17859 */
17860
17861#define X86_SSE_ROUNDXX_IMM_RC_MASK UINT8_C(0x03)
17862#define X86_SSE_ROUNDXX_IMM_ROUND_SEL UINT8_C(0x04)
17863#define X86_SSE_ROUNDXX_IMM_PRECISION UINT8_C(0x08)
17864
17865#define X86_SSE_ROUNDXX_IMM_MASK UINT8_C(0x0F)
17866
17867DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
17868{
17869 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
17870 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17871
17872 fMxcsr &= ~X86_MXCSR_RC_MASK;
17873 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
17874 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17875}
17876
17877static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
17878{
17879 RTFLOAT32U r32Src, r32Dst;
17880 float32_t f32Src;
17881 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
17882 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
17883
17884 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
17885 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
17886
17887 iemFpSoftF32ToIprt(&r32Dst, f32Src);
17888 return r32Dst;
17889}
17890
17891static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
17892{
17893 RTFLOAT64U r64Src, r64Dst;
17894 float64_t f64Src;
17895 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
17896 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
17897
17898 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
17899 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
17900
17901 iemFpSoftF64ToIprt(&r64Dst, f64Src);
17902 return r64Dst;
17903}
17904
17905#ifdef IEM_WITHOUT_ASSEMBLY
17906IEM_DECL_IMPL_DEF(void, iemAImpl_roundss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17907{
17908 puDst->ar32[0] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17909 puDst->au32[1] = pSrc->uSrc1.au32[1];
17910 puDst->au64[1] = pSrc->uSrc1.au64[1];
17911}
17912
17913
17914IEM_DECL_IMPL_DEF(void, iemAImpl_roundsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17915{
17916 puDst->ar64[0] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17917 puDst->au64[1] = pSrc->uSrc1.au64[1];
17918}
17919#endif
17920
17921IEM_DECL_IMPL_DEF(void, iemAImpl_roundps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17922{
17923 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
17924 {
17925 puDst->ar32[i] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17926 }
17927}
17928
17929
17930IEM_DECL_IMPL_DEF(void, iemAImpl_roundpd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17931{
17932 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
17933 {
17934 puDst->ar64[i] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17935 }
17936}
17937
17938/**
17939 * CVTPD2PI
17940 */
17941#ifdef IEM_WITHOUT_ASSEMBLY
17942static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17943{
17944 RTFLOAT64U r64Src;
17945 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17946
17947 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17948 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17949 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17950}
17951
17952
17953IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17954{
17955 RTUINT64U u64Res;
17956 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17957 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17958
17959 *pu64Dst = u64Res.u;
17960 *pfMxcsr = fMxcsrOut;
17961}
17962#endif
17963
17964
17965/**
17966 * CVTTPD2PI
17967 */
17968#ifdef IEM_WITHOUT_ASSEMBLY
17969static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17970{
17971 RTFLOAT64U r64Src;
17972 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17973
17974 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17975 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17976 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17977}
17978
17979
17980IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17981{
17982 RTUINT64U u64Res;
17983 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17984 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17985
17986 *pu64Dst = u64Res.u;
17987 *pfMxcsr = fMxcsrOut;
17988}
17989#endif
17990
17991
17992/**
17993 * CVTPI2PS
17994 */
17995#ifdef IEM_WITHOUT_ASSEMBLY
17996static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
17997{
17998 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17999 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
18000 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
18001}
18002
18003
18004IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
18005{
18006 RTUINT64U uSrc = { u64Src };
18007 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
18008 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
18009 *pfMxcsr = fMxcsrOut;
18010}
18011#endif
18012
18013
18014/**
18015 * CVTPI2PD
18016 */
18017#ifdef IEM_WITHOUT_ASSEMBLY
18018static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
18019{
18020 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18021 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
18022 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
18023}
18024
18025
18026IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
18027{
18028 RTUINT64U uSrc = { u64Src };
18029 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
18030 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
18031 *pfMxcsr = fMxcsrOut;
18032}
18033#endif
18034
18035
18036/**
18037 * CVTPS2PI
18038 */
18039#ifdef IEM_WITHOUT_ASSEMBLY
18040static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
18041{
18042 RTFLOAT32U r32Src;
18043 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
18044
18045 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18046 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18047 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18048}
18049
18050
18051IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
18052{
18053 RTUINT64U uDst;
18054 RTUINT64U uSrc = { u64Src };
18055 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
18056 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
18057 *pu64Dst = uDst.u;
18058 *pfMxcsr = fMxcsrOut;
18059}
18060#endif
18061
18062
18063/**
18064 * CVTTPS2PI
18065 */
18066#ifdef IEM_WITHOUT_ASSEMBLY
18067static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
18068{
18069 RTFLOAT32U r32Src;
18070 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
18071
18072 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18073 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18074 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18075}
18076
18077
18078IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
18079{
18080 RTUINT64U uDst;
18081 RTUINT64U uSrc = { u64Src };
18082 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
18083 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
18084 *pu64Dst = uDst.u;
18085 *pfMxcsr = fMxcsrOut;
18086}
18087#endif
18088
18089/**
18090 * RDRAND
18091 */
18092IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18093{
18094 *puDst = 0;
18095 *pEFlags &= ~X86_EFL_STATUS_BITS;
18096 *pEFlags |= X86_EFL_CF;
18097}
18098
18099IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18100{
18101 *puDst = 0;
18102 *pEFlags &= ~X86_EFL_STATUS_BITS;
18103 *pEFlags |= X86_EFL_CF;
18104}
18105
18106IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18107{
18108 *puDst = 0;
18109 *pEFlags &= ~X86_EFL_STATUS_BITS;
18110 *pEFlags |= X86_EFL_CF;
18111}
18112
18113/**
18114 * RDSEED
18115 */
18116IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18117{
18118 *puDst = 0;
18119 *pEFlags &= ~X86_EFL_STATUS_BITS;
18120 *pEFlags |= X86_EFL_CF;
18121}
18122
18123IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18124{
18125 *puDst = 0;
18126 *pEFlags &= ~X86_EFL_STATUS_BITS;
18127 *pEFlags |= X86_EFL_CF;
18128}
18129
18130IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18131{
18132 *puDst = 0;
18133 *pEFlags &= ~X86_EFL_STATUS_BITS;
18134 *pEFlags |= X86_EFL_CF;
18135}
18136
18137
18138/**
18139 * SHA1NEXTE
18140 */
18141IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18142{
18143 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
18144
18145 puDst->au32[0] = puSrc->au32[0];
18146 puDst->au32[1] = puSrc->au32[1];
18147 puDst->au32[2] = puSrc->au32[2];
18148 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
18149}
18150
18151/**
18152 * SHA1MSG1
18153 */
18154IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18155{
18156 uint32_t u32W0 = puDst->au32[3];
18157 uint32_t u32W1 = puDst->au32[2];
18158 uint32_t u32W2 = puDst->au32[1];
18159 uint32_t u32W3 = puDst->au32[0];
18160 uint32_t u32W4 = puSrc->au32[3];
18161 uint32_t u32W5 = puSrc->au32[2];
18162
18163 puDst->au32[3] = u32W2 ^ u32W0;
18164 puDst->au32[2] = u32W3 ^ u32W1;
18165 puDst->au32[1] = u32W4 ^ u32W2;
18166 puDst->au32[0] = u32W5 ^ u32W3;
18167}
18168
18169/**
18170 * SHA1MSG2
18171 */
18172IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18173{
18174 uint32_t u32W13 = puSrc->au32[2];
18175 uint32_t u32W14 = puSrc->au32[1];
18176 uint32_t u32W15 = puSrc->au32[0];
18177 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
18178 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
18179 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
18180 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
18181
18182 puDst->au32[3] = u32W16;
18183 puDst->au32[2] = u32W17;
18184 puDst->au32[1] = u32W18;
18185 puDst->au32[0] = u32W19;
18186}
18187
18188/**
18189 * SHA1RNDS4
18190 */
18191typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
18192typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;
18193
18194static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18195{
18196 return (u32B & u32C) ^ (~u32B & u32D);
18197}
18198
18199static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18200{
18201 return u32B ^ u32C ^ u32D;
18202}
18203
18204static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18205{
18206 return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
18207}
18208
18209static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18210{
18211 return u32B ^ u32C ^ u32D;
18212}
18213
18214IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18215{
18216 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
18217 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
18218
18219 uint32_t au32A[5];
18220 uint32_t au32B[5];
18221 uint32_t au32C[5];
18222 uint32_t au32D[5];
18223 uint32_t au32E[5];
18224 uint32_t au32W[4];
18225 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
18226 uint32_t u32K = s_au32K[bEvil & 0x3];
18227
18228 au32A[0] = puDst->au32[3];
18229 au32B[0] = puDst->au32[2];
18230 au32C[0] = puDst->au32[1];
18231 au32D[0] = puDst->au32[0];
18232 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
18233 au32W[i] = puSrc->au32[3 - i];
18234
18235 /* Round 0 is a bit different than the other rounds. */
18236 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
18237 au32B[1] = au32A[0];
18238 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
18239 au32D[1] = au32C[0];
18240 au32E[1] = au32D[0];
18241
18242 for (uint32_t i = 1; i <= 3; i++)
18243 {
18244 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
18245 au32B[i + 1] = au32A[i];
18246 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
18247 au32D[i + 1] = au32C[i];
18248 au32E[i + 1] = au32D[i];
18249 }
18250
18251 puDst->au32[3] = au32A[4];
18252 puDst->au32[2] = au32B[4];
18253 puDst->au32[1] = au32C[4];
18254 puDst->au32[0] = au32D[4];
18255}
18256
18257
18258/**
18259 * SHA256MSG1
18260 */
18261DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
18262{
18263 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
18264}
18265
18266IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18267{
18268 uint32_t u32W4 = puSrc->au32[0];
18269 uint32_t u32W3 = puDst->au32[3];
18270 uint32_t u32W2 = puDst->au32[2];
18271 uint32_t u32W1 = puDst->au32[1];
18272 uint32_t u32W0 = puDst->au32[0];
18273
18274 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
18275 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
18276 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
18277 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
18278}
18279
18280/**
18281 * SHA256MSG2
18282 */
18283DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
18284{
18285 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
18286}
18287
18288IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18289{
18290 uint32_t u32W14 = puSrc->au32[2];
18291 uint32_t u32W15 = puSrc->au32[3];
18292 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
18293 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
18294 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
18295 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
18296
18297 puDst->au32[3] = u32W19;
18298 puDst->au32[2] = u32W18;
18299 puDst->au32[1] = u32W17;
18300 puDst->au32[0] = u32W16;
18301}
18302
18303/**
18304 * SHA256RNDS2
18305 */
18306DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
18307{
18308 return (u32X & u32Y) ^ (~u32X & u32Z);
18309}
18310
18311DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
18312{
18313 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
18314}
18315
18316DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
18317{
18318 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
18319}
18320
18321DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
18322{
18323 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
18324}
18325
18326IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
18327{
18328 uint32_t au32A[3];
18329 uint32_t au32B[3];
18330 uint32_t au32C[3];
18331 uint32_t au32D[3];
18332 uint32_t au32E[3];
18333 uint32_t au32F[3];
18334 uint32_t au32G[3];
18335 uint32_t au32H[3];
18336 uint32_t au32WK[2];
18337
18338 au32A[0] = puSrc->au32[3];
18339 au32B[0] = puSrc->au32[2];
18340 au32C[0] = puDst->au32[3];
18341 au32D[0] = puDst->au32[2];
18342 au32E[0] = puSrc->au32[1];
18343 au32F[0] = puSrc->au32[0];
18344 au32G[0] = puDst->au32[1];
18345 au32H[0] = puDst->au32[0];
18346
18347 au32WK[0] = puXmm0Constants->au32[0];
18348 au32WK[1] = puXmm0Constants->au32[1];
18349
18350 for (uint32_t i = 0; i < 2; i++)
18351 {
18352 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
18353 + iemAImpl_sha256_upper_sigma1(au32E[i])
18354 + au32WK[i]
18355 + au32H[i]
18356 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
18357 + iemAImpl_sha256_upper_sigma0(au32A[i]);
18358 au32B[i + 1] = au32A[i];
18359 au32C[i + 1] = au32B[i];
18360 au32D[i + 1] = au32C[i];
18361 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
18362 + iemAImpl_sha256_upper_sigma1(au32E[i])
18363 + au32WK[i]
18364 + au32H[i]
18365 + au32D[i];
18366 au32F[i + 1] = au32E[i];
18367 au32G[i + 1] = au32F[i];
18368 au32H[i + 1] = au32G[i];
18369 }
18370
18371 puDst->au32[3] = au32A[2];
18372 puDst->au32[2] = au32B[2];
18373 puDst->au32[1] = au32E[2];
18374 puDst->au32[0] = au32F[2];
18375}
18376
18377
18378/**
18379 * ADCX
18380 */
18381#define ADX_EMIT(a_Flag, a_Type, a_Max) \
18382 do \
18383 { \
18384 bool f = RT_BOOL(*pfEFlags & (a_Flag)); \
18385 a_Type uTmp = *puDst + uSrc; \
18386 if (uTmp < uSrc) \
18387 *pfEFlags |= (a_Flag); \
18388 else \
18389 *pfEFlags &= ~(a_Flag); \
18390 if ( uTmp == a_Max \
18391 && f) \
18392 *pfEFlags |= (a_Flag); \
18393 if (f) \
18394 uTmp++; \
18395 *puDst = uTmp; \
18396 } \
18397 while (0)
18398
18399IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32_fallback,(uint32_t *puDst, uint32_t *pfEFlags, uint32_t uSrc))
18400{
18401 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
18402}
18403
18404IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64_fallback,(uint64_t *puDst, uint32_t *pfEFlags, uint64_t uSrc))
18405{
18406 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
18407}
18408
18409
18410/**
18411 * ADOX
18412 */
18413IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32_fallback,(uint32_t *puDst, uint32_t *pfEFlags, uint32_t uSrc))
18414{
18415 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
18416}
18417
18418IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64_fallback,(uint64_t *puDst, uint32_t *pfEFlags, uint64_t uSrc))
18419{
18420 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
18421}
18422
18423
18424/**
18425 * MPSADBW
18426 */
18427IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18428{
18429 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
18430 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
18431 int16_t ai16Src1[11];
18432 int16_t ai16Src2[4];
18433
18434 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
18435 ai16Src1[i] = puDst->au8[idxSrc1 + i];
18436
18437 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
18438 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
18439
18440 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18441 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
18442 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
18443 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
18444 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
18445}
18446
18447
18448IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
18449{
18450 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
18451 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
18452 int16_t ai16Src1[11];
18453 int16_t ai16Src2[4];
18454
18455 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
18456 ai16Src1[i] = puSrc1->au8[idxSrc1 + i];
18457
18458 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
18459 ai16Src2[i] = puSrc2->au8[idxSrc2 + i];
18460
18461 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18462 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
18463 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
18464 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
18465 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
18466}
18467
18468
18469IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
18470{
18471 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
18472 RTUINT256U const uSrc2 = *puSrc2;
18473 ASMCompilerBarrier();
18474 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
18475 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil >> 3);
18476}
18477
18478
18479/**
18480 * VPERM2I128
18481 */
18482IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2i128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
18483{
18484 if (bImm & RT_BIT(3))
18485 {
18486 puDst->au64[0] = 0;
18487 puDst->au64[1] = 0;
18488 }
18489 else
18490 {
18491 switch (bImm & 0x3)
18492 {
18493 case 0:
18494 puDst->au64[0] = puSrc1->au64[0];
18495 puDst->au64[1] = puSrc1->au64[1];
18496 break;
18497 case 1:
18498 puDst->au64[0] = puSrc1->au64[2];
18499 puDst->au64[1] = puSrc1->au64[3];
18500 break;
18501 case 2:
18502 puDst->au64[0] = puSrc2->au64[0];
18503 puDst->au64[1] = puSrc2->au64[1];
18504 break;
18505 case 3:
18506 puDst->au64[0] = puSrc2->au64[2];
18507 puDst->au64[1] = puSrc2->au64[3];
18508 break;
18509 }
18510 }
18511
18512 if (bImm & RT_BIT(7))
18513 {
18514 puDst->au64[2] = 0;
18515 puDst->au64[3] = 0;
18516 }
18517 else
18518 {
18519 switch ((bImm >> 4) & 0x3)
18520 {
18521 case 0:
18522 puDst->au64[2] = puSrc1->au64[0];
18523 puDst->au64[3] = puSrc1->au64[1];
18524 break;
18525 case 1:
18526 puDst->au64[2] = puSrc1->au64[2];
18527 puDst->au64[3] = puSrc1->au64[3];
18528 break;
18529 case 2:
18530 puDst->au64[2] = puSrc2->au64[0];
18531 puDst->au64[3] = puSrc2->au64[1];
18532 break;
18533 case 3:
18534 puDst->au64[2] = puSrc2->au64[2];
18535 puDst->au64[3] = puSrc2->au64[3];
18536 break;
18537 }
18538 }
18539}
18540
18541
18542/**
18543 * VPERM2F128
18544 */
18545IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2f128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
18546{
18547 iemAImpl_vperm2i128_u256_fallback(puDst, puSrc1, puSrc2, bImm);
18548}
18549
18550
18551/**
18552 * DPPS
18553 */
18554IEM_DECL_IMPL_DEF(void, iemAImpl_dpps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18555{
18556 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
18557 AssertReleaseFailed();
18558}
18559
18560
18561/**
18562 * DPPD
18563 */
18564IEM_DECL_IMPL_DEF(void, iemAImpl_dppd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18565{
18566 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
18567 AssertReleaseFailed();
18568}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette